mai_pi-33_zakharov/lab4_sandbox.ipynb at 75b0e0f5808bcb01a0e2627016f303a161a66b8a

Zakharov_Rostislav 75b0e0f580 feat(lab-4): r2 0,64

2024-12-12 23:48:52 +04:00

211 KiB

Raw Blame History

Загрузка набора данных¶

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import math


from pandas import DataFrame
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn import linear_model, tree, neighbors, ensemble

from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.pipeline import make_pipeline


df = pd.read_csv("../data/car_price_prediction.csv")

df = df.drop(columns=["ID"])

df

Out[1]:

	Price	Levy	Manufacturer	Model	Prod. year	Category	Leather interior	Fuel type	Engine volume	Mileage	Cylinders	Gear box type	Drive wheels	Doors	Wheel	Color	Airbags
0	13328	1399	LEXUS	RX 450	2010	Jeep	Yes	Hybrid	3.5	186005 km	6.0	Automatic	4x4	04-May	Left wheel	Silver	12
1	16621	1018	CHEVROLET	Equinox	2011	Jeep	No	Petrol	3	192000 km	6.0	Tiptronic	4x4	04-May	Left wheel	Black	8
2	8467	-	HONDA	FIT	2006	Hatchback	No	Petrol	1.3	200000 km	4.0	Variator	Front	04-May	Right-hand drive	Black	2
3	3607	862	FORD	Escape	2011	Jeep	Yes	Hybrid	2.5	168966 km	4.0	Automatic	4x4	04-May	Left wheel	White	0
4	11726	446	HONDA	FIT	2014	Hatchback	Yes	Petrol	1.3	91901 km	4.0	Automatic	Front	04-May	Left wheel	Silver	4
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19232	8467	-	MERCEDES-BENZ	CLK 200	1999	Coupe	Yes	CNG	2.0 Turbo	300000 km	4.0	Manual	Rear	02-Mar	Left wheel	Silver	5
19233	15681	831	HYUNDAI	Sonata	2011	Sedan	Yes	Petrol	2.4	161600 km	4.0	Tiptronic	Front	04-May	Left wheel	Red	8
19234	26108	836	HYUNDAI	Tucson	2010	Jeep	Yes	Diesel	2	116365 km	4.0	Automatic	Front	04-May	Left wheel	Grey	4
19235	5331	1288	CHEVROLET	Captiva	2007	Jeep	Yes	Diesel	2	51258 km	4.0	Automatic	Front	04-May	Left wheel	Black	4
19236	470	753	HYUNDAI	Sonata	2012	Sedan	Yes	Hybrid	2.4	186923 km	4.0	Automatic	Front	04-May	Left wheel	White	12

19237 rows × 17 columns

Анализ датасета и очистка данных¶

In [2]:

df.dtypes

Out[2]:

Price                 int64
Levy                 object
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume        object
Mileage              object
Cylinders           float64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object

In [3]:

df["Engine volume"] = df["Engine volume"].str.replace("Turbo", "")
df["Engine volume"] = pd.to_numeric(df["Engine volume"])
df["Engine volume"].unique()

Out[3]:

array([ 3.5,  3. ,  1.3,  2.5,  2. ,  1.8,  2.4,  4. ,  1.6,  3.3,  2.2,
        4.7,  1.5,  4.4,  1.4,  3.6,  2.3,  5.5,  2.8,  3.2,  3.8,  4.6,
        1.2,  5. ,  1.7,  2.9,  0.5,  1.9,  2.7,  4.8,  5.3,  0.4,  1.1,
        2.1,  0.7,  5.4,  3.7,  1. ,  2.6,  0.8,  0.2,  5.7,  6.7,  6.2,
        3.4,  6.3,  4.3,  4.2,  0. , 20. ,  0.3,  5.9,  5.6,  6. ,  0.6,
        6.8,  4.5,  7.3,  0.1,  3.1,  6.4,  3.9,  0.9,  5.2,  5.8])

In [4]:

df["Mileage"] = df["Mileage"].str.replace("km", "")
df["Mileage"] = df["Mileage"].astype("int64")
df["Mileage"].unique()

Out[4]:

array([186005, 192000, 200000, ..., 140607, 307325, 186923])

In [5]:

df["Levy"] = df["Levy"].replace("-", "0")
df["Levy"] = df["Levy"].astype("int64")
df["Levy"].unique()

Out[5]:

array([ 1399,  1018,     0,   862,   446,   891,   761,   751,   394,
        1053,  1055,  1079,   810,  2386,  1850,   531,   586,  1249,
        2455,   583,  1537,  1288,   915,  1750,   707,  1077,  1486,
        1091,   650,   382,  1436,  1194,   503,  1017,  1104,   639,
         629,   919,   781,   530,   640,   765,   777,   779,   934,
         769,   645,  1185,  1324,   830,  1187,  1111,   760,   642,
        1604,  1095,   966,   473,  1138,  1811,   988,   917,  1156,
         687, 11714,   836,  1347,  2866,  1646,   259,   609,   697,
         585,   475,   690,   308,  1823,  1361,  1273,   924,   584,
        2078,   831,  1172,   893,  1872,  1885,  1266,   447,  2148,
        1730,   730,   289,   502,   333,  1325,   247,   879,  1342,
        1327,  1598,  1514,  1058,   738,  1935,   481,  1522,  1282,
         456,   880,   900,   798,  1277,   442,  1051,   790,  1292,
        1047,   528,  1211,  1493,  1793,   574,   930,  1998,   271,
         706,  1481,  1677,  1661,  1286,  1408,  1090,   595,  1451,
        1267,   993,  1714,   878,   641,   749,  1511,   603,   353,
         877,  1236,  1141,   397,   784,  1024,  1357,  1301,   770,
         922,  1438,   753,   607,  1363,   638,   490,   431,   565,
         517,   833,   489,  1760,   986,  1841,  1620,  1360,   474,
        1099,   978,  1624,  1946,  1268,  1307,   696,   649,   666,
        2151,   551,   800,   971,  1323,  2377,  1845,  1083,   694,
         463,   419,   345,  1515,  1505,  2056,  1203,   729,   460,
        1356,   876,   911,  1190,   780,   448,  2410,  1848,  1148,
         834,  1275,  1028,  1197,   724,   890,  1705,   505,   789,
        2959,   518,   461,  1719,  2858,  3156,  2225,  2177,  1968,
        1888,  1308,  2736,  1103,   557,  2195,   843,  1664,   723,
        4508,   562,   501,  2018,  1076,  1202,  3301,   691,  1440,
        1869,  1178,   418,  1820,  1413,   488,  1304,   363,  2108,
         521,  1659,    87,  1411,  1528,  3292,  7058,  1578,   627,
         874,  1996,  1488,  5679,  1234,  5603,   400,   889,  3268,
         875,   949,  2265,   441,   742,   425,  2476,  2971,   614,
        1816,  1375,  1405,  2297,  1062,  1113,   420,  2469,   658,
        1951,  2670,  2578,  1995,  1032,   994,  1011,  2421,  1296,
         155,   494,   426,  1086,   961,  2236,  1829,   764,  1834,
        1054,   617,  1529,  2266,   637,   626,  1832,  1016,  2002,
        1756,   746,  1285,  2690,  1118,  5332,   980,  1807,   970,
        1228,  1195,  1132,  1768,  1384,  1080,  7063,  1817,  1452,
        1975,  1368,   702,  1974,  1781,  1036,   944,   663,   364,
        1539,  1345,  1680,  2209,   741,  1575,   695,  1317,   294,
        1525,   424,   997,  1473,  1552,  2819,  2188,  1668,  3057,
         799,  1502,  2606,   552,  1694,  1759,  1110,   399,  1470,
        1174,  5877,  1474,  1688,   526,   686,  5908,  1107,  2070,
        1468,  1246,  1685,   556,  1533,  1917,  1346,   732,   692,
         579,   421,   362,  3505,  1855,  2711,  1586,  3739,   681,
        1708,  2278,  1701,   722,  1482,   928,   827,   832,   527,
         604,   173,  1341,  3329,  1553,   859,   167,   916,   828,
        2082,  1176,  1108,   975,  3008,  1516,  2269,  1699,  2073,
        1031,  1503,  2364,  1030,  1442,  5666,  2715,  1437,  2067,
        1426,  2908,  1279,   866,  4283,   279,  2658,  3015,  2004,
        1391,  4736,   748,  1466,   644,   683,  2705,  1297,   731,
        1252,  2216,  3141,  3273,  1518,  1723,  1588,   972,   682,
        1094,   668,   175,   967,   402,  3894,  1960,  1599,  2000,
        2084,  1621,   714,  1109,  3989,   873,  1572,  1163,  1991,
        1716,  1673,  2562,  2874,   965,   462,   605,  1948,  1736,
        3518,  2054,  2467,  1681,  1272,  1205,   750,  2156,  2566,
         115,   524,  3184,   676,  1678,   612,   328,   955,  1441,
        1675,  3965,  2909,   623,   822,   867,  3025,  1993,   792,
         636,  4057,  3743,  2337,  2570,  2418,  2472,  3910,  1662,
        2123,  2628,  3208,  2080,  3699,  2913,   864,  2505,   870,
        7536,  1924,  1671,  1064,  1836,  1866,  4741,   841,  1369,
        5681,  3112,  1366,  2223,  1198,  1039,  3811,  3571,  1387,
        1171,  1365,  1531,  1590, 11706,  2308,  4860,  1641,  1045,
        1901])

In [6]:

df["Cylinders"] = df["Cylinders"].astype("int64")
df["Cylinders"].unique()

Out[6]:

array([ 6,  4,  8,  1, 12,  3,  2, 16,  5,  7,  9, 10, 14])

In [7]:

df["Doors"].unique()

Out[7]:

array(['04-May', '02-Mar', '>5'], dtype=object)

In [8]:

df["Doors"] = df["Doors"].map(
    {"02-Mar": "Двухдверный", "04-May": "Четырехдверный", ">5": "Многодверный"}
)
df["Doors"].unique()

Out[8]:

array(['Четырехдверный', 'Двухдверный', 'Многодверный'], dtype=object)

In [9]:

sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()

Out[9]:

array([       1,        3,        6, ...,   627220,   872946, 26307500])

In [10]:

print(f"Количество строк до удаления некорректных значений: {len(df)}")
df = df[df["Price"] >= 500]
print(f"Количество строк после удаления некорректных значений: {len(df)}")

Количество строк до удаления некорректных значений: 19237
Количество строк после удаления некорректных значений: 17574

In [11]:

sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()

Out[11]:

array([     500,      549,      600, ...,   627220,   872946, 26307500])

In [12]:

sorted_df = df.sort_values(by="Prod. year")
sorted_df["Prod. year"].unique()

Out[12]:

array([1943, 1953, 1957, 1964, 1965, 1968, 1973, 1974, 1977, 1978, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [13]:

df

Out[13]:

	Price	Levy	Manufacturer	Model	Prod. year	Category	Leather interior	Fuel type	Engine volume	Mileage	Cylinders	Gear box type	Drive wheels	Doors	Wheel	Color	Airbags
0	13328	1399	LEXUS	RX 450	2010	Jeep	Yes	Hybrid	3.5	186005	6	Automatic	4x4	Четырехдверный	Left wheel	Silver	12
1	16621	1018	CHEVROLET	Equinox	2011	Jeep	No	Petrol	3.0	192000	6	Tiptronic	4x4	Четырехдверный	Left wheel	Black	8
2	8467	0	HONDA	FIT	2006	Hatchback	No	Petrol	1.3	200000	4	Variator	Front	Четырехдверный	Right-hand drive	Black	2
3	3607	862	FORD	Escape	2011	Jeep	Yes	Hybrid	2.5	168966	4	Automatic	4x4	Четырехдверный	Left wheel	White	0
4	11726	446	HONDA	FIT	2014	Hatchback	Yes	Petrol	1.3	91901	4	Automatic	Front	Четырехдверный	Left wheel	Silver	4
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19231	5802	1055	MERCEDES-BENZ	E 350	2013	Sedan	Yes	Diesel	3.5	107800	6	Automatic	Rear	Четырехдверный	Left wheel	Grey	12
19232	8467	0	MERCEDES-BENZ	CLK 200	1999	Coupe	Yes	CNG	2.0	300000	4	Manual	Rear	Двухдверный	Left wheel	Silver	5
19233	15681	831	HYUNDAI	Sonata	2011	Sedan	Yes	Petrol	2.4	161600	4	Tiptronic	Front	Четырехдверный	Left wheel	Red	8
19234	26108	836	HYUNDAI	Tucson	2010	Jeep	Yes	Diesel	2.0	116365	4	Automatic	Front	Четырехдверный	Left wheel	Grey	4
19235	5331	1288	CHEVROLET	Captiva	2007	Jeep	Yes	Diesel	2.0	51258	4	Automatic	Front	Четырехдверный	Left wheel	Black	4

17574 rows × 17 columns

Очистка дубликатов и пропущенных значений¶

In [14]:

df.duplicated().sum()

Out[14]:

np.int64(2773)

In [15]:

df.drop_duplicates(inplace=True)

C:\Users\user\AppData\Local\Temp\ipykernel_27528\3006716147.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)

In [16]:

df.isna().sum()

Out[16]:

Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

Очистка выбросов¶

In [17]:

df.dtypes

Out[17]:

Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object

In [18]:

numeric_features_with_outliers = [
    "Price",
    "Levy",
    "Mileage",
    "Prod. year",
]

i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1

No description has been provided for this image

In [19]:

def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

print(f"Количество строк до удаления выбросов: {len(df)}")

for column in numeric_features_with_outliers:
    df = remove_outliers(df, column)

print(f"Количество строк после удаления выбросов: {len(df)}")

Количество строк до удаления выбросов: 14801
Количество строк после удаления выбросов: 12597

In [20]:

i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1

Разбиение на выборки¶

In [21]:

X = df
y = df["Category"]

train_df, test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df.shape[0]} записей")
print(train_df.Category.value_counts())
print(f"Тестовая выборка: {test_df.shape[0]} записей")
print(test_df.Category.value_counts())

Размеры выборок:
Обучающая выборка: 8817 записей
Category
Sedan          3954
Jeep           2263
Hatchback      1554
Minivan         312
Coupe           251
Universal       180
Microbus        143
Goods wagon     120
Pickup           22
Cabriolet        16
Limousine         2
Name: count, dtype: int64
Тестовая выборка: 3780 записей
Category
Sedan          1692
Jeep            990
Hatchback       636
Minivan         151
Coupe           117
Universal        82
Goods wagon      52
Microbus         46
Pickup            8
Cabriolet         5
Limousine         1
Name: count, dtype: int64

Oversampling¶

In [22]:

def oversample(df):
    X = df.drop("Category", axis=1)
    y = df["Category"]

    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore

    resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
    return resampled_df


train_df_overs = oversample(train_df)
test_df_overs = oversample(test_df)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df_overs.shape[0]} записей")
print(train_df_overs.Category.value_counts())
print(f"Тестовая выборка: {test_df_overs.shape[0]} записей")
print(test_df_overs.Category.value_counts())

Размеры выборок:
Обучающая выборка: 43494 записей
Category
Sedan          3954
Jeep           3954
Universal      3954
Hatchback      3954
Coupe          3954
Goods wagon    3954
Minivan        3954
Microbus       3954
Pickup         3954
Limousine      3954
Cabriolet      3954
Name: count, dtype: int64
Тестовая выборка: 18612 записей
Category
Hatchback      1692
Sedan          1692
Universal      1692
Jeep           1692
Coupe          1692
Minivan        1692
Goods wagon    1692
Microbus       1692
Pickup         1692
Cabriolet      1692
Limousine      1692
Name: count, dtype: int64

In [23]:

price_y_train = train_df["Price"]
price_y_test = test_df["Price"]
train_df = train_df.drop(columns=["Price"])
test_df = test_df.drop(columns=["Price"])

Ручной синтез признаков.¶

In [24]:

def age_create(df): 
    df["Age"] = 2020 - df["Prod. year"]
    df = df.drop(columns=["Prod. year"])
    return df

train_df = age_create(train_df)
test_df = age_create(test_df)

In [25]:

sorted_df = train_df.sort_values(by="Age")
sorted_df["Age"].unique()

Out[25]:

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21])

In [26]:

plt.figure(figsize=(4, 30))
plt.subplot(6, 1, i)
train_df.boxplot(column="Age")

Out[26]:

<Axes: >

Унитарное кодирование категориальных признаков¶

In [27]:

train_df.dtypes

Out[27]:

Levy                  int64
Manufacturer         object
Model                object
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
Age                   int64
dtype: object

In [28]:

categorical_features_for_encoding = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
]

def onehot_encode(df):
    encoder = OneHotEncoder(sparse_output=False, drop="first")
    temp = encoder.fit_transform(df[categorical_features_for_encoding])
    df = df.drop(columns=categorical_features_for_encoding)
    return temp


temp = onehot_encode(train_df)
temp
# onehot_encode(test_df)

# train_df

Out[28]:

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.]])

Масштабирование признаков¶

In [29]:

train_df.dtypes

Out[29]:

Levy                  int64
Manufacturer         object
Model                object
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
Age                   int64
dtype: object

In [30]:

scaler = StandardScaler()

numeric_features_for_stardartization = [
    "Levy",
    "Engine volume",
    "Mileage",
    "Cylinders",
    "Airbags",
    "Age",
]

train_df[numeric_features_for_stardartization] = scaler.fit_transform(
    train_df[numeric_features_for_stardartization]
)
test_df[numeric_features_for_stardartization] = scaler.transform(
    test_df[numeric_features_for_stardartization]
)

train_df

Out[30]:

	Levy	Manufacturer	Model	Category	Leather interior	Fuel type	Engine volume	Mileage	Cylinders	Gear box type	Drive wheels	Doors	Wheel	Color	Airbags	Age
15146	-1.192982	TOYOTA	Prius	Sedan	No	Petrol	-0.479341	-1.531744	-0.403213	Automatic	Front	Четырехдверный	Left wheel	White	-0.683755	-1.946936
14145	-1.192982	TOYOTA	Ractis	Sedan	No	Petrol	-0.887855	-0.130245	-0.403213	Tiptronic	Front	Четырехдверный	Right-hand drive	Brown	-1.190217	0.879266
8943	0.081576	HYUNDAI	Elantra	Sedan	No	Petrol	-0.479341	-0.651122	-0.403213	Tiptronic	Front	Четырехдверный	Left wheel	Beige	0.835631	-0.642535
17889	-1.192982	SUBARU	Forester L.L.BEAN	Jeep	Yes	CNG	0.473858	1.263152	-0.403213	Automatic	4x4	Четырехдверный	Left wheel	Green	-0.430524	1.531466
9515	1.703146	HYUNDAI	H1	Universal	Yes	Diesel	0.473858	-0.739330	-0.403213	Automatic	Front	Четырехдверный	Left wheel	Grey	-0.683755	-1.512135
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
18201	-1.192982	AUDI	A4	Sedan	Yes	Petrol	0.337687	0.318018	1.538421	Manual	4x4	Четырехдверный	Left wheel	Grey	-0.683755	0.879266
7436	0.476602	KIA	Avella	Sedan	Yes	Petrol	-0.206998	-0.011145	-0.403213	Automatic	Front	Четырехдверный	Left wheel	Silver	1.342092	-0.859935
7728	0.646834	TOYOTA	Prius	Hatchback	No	Hybrid	-0.887855	1.155137	-0.403213	Variator	Front	Четырехдверный	Left wheel	Silver	0.329169	0.661866
1136	-1.192982	FORD	Fiesta	Sedan	No	Petrol	-0.751684	-0.697325	-0.403213	Automatic	Front	Четырехдверный	Left wheel	Silver	0.329169	-1.077335
10640	0.446048	KIA	Optima EX	Sedan	Yes	Petrol	0.337687	-0.465093	7.363324	Tiptronic	Front	Четырехдверный	Left wheel	Silver	0.329169	-0.425135

8817 rows × 16 columns

Числовое кодирование категориальных признаков¶

In [31]:

train_df.dtypes

Out[31]:

Levy                float64
Manufacturer         object
Model                object
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage             float64
Cylinders           float64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags             float64
Age                 float64
dtype: object

In [32]:

categorical_features_for_encoding = [
    "Model",
    "Manufacturer",
    "Color",
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
]
def label_encode(df):
    encoder = LabelEncoder()
    for column in df.columns:
        if column in categorical_features_for_encoding:
            df[column] = encoder.fit_transform(df[column])

label_encode(train_df)
label_encode(test_df)

train_df

Out[32]:

	Levy	Manufacturer	Model	Category	Leather interior	Fuel type	Engine volume	Mileage	Cylinders	Gear box type	Drive wheels	Doors	Wheel	Color	Airbags	Age
15146	-1.192982	47	798	9	0	5	-0.479341	-1.531744	-0.403213	0	1	2	0	14	-0.683755	-1.946936
14145	-1.192982	47	864	9	0	5	-0.887855	-0.130245	-0.403213	2	1	2	1	3	-1.190217	0.879266
8943	0.081576	17	420	9	0	5	-0.479341	-0.651122	-0.403213	2	1	2	0	0	0.835631	-0.642535
17889	-1.192982	45	488	4	1	0	0.473858	1.263152	-0.403213	0	0	2	0	6	-0.430524	1.531466
9515	1.703146	17	549	10	1	1	0.473858	-0.739330	-0.403213	0	1	2	0	7	-0.683755	-1.512135
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
18201	-1.192982	2	121	9	1	5	0.337687	0.318018	1.538421	1	0	2	0	7	-0.683755	0.879266
7436	0.476602	22	178	9	1	5	-0.206998	-0.011145	-0.403213	0	1	2	0	12	1.342092	-0.859935
7728	0.646834	47	798	3	0	2	-0.887855	1.155137	-0.403213	3	1	2	0	12	0.329169	0.661866
1136	-1.192982	13	474	9	0	5	-0.751684	-0.697325	-0.403213	0	1	2	0	12	0.329169	-1.077335
10640	0.446048	22	743	9	1	5	0.337687	-0.465093	7.363324	2	1	2	0	12	0.329169	-0.425135

8817 rows × 16 columns

Формирование набора моделей¶

In [33]:

train_df.dtypes

Out[33]:

Levy                float64
Manufacturer          int64
Model                 int64
Category              int64
Leather interior      int64
Fuel type             int64
Engine volume       float64
Mileage             float64
Cylinders           float64
Gear box type         int64
Drive wheels          int64
Doors                 int64
Wheel                 int64
Color                 int64
Airbags             float64
Age                 float64
dtype: object

In [34]:

test_df.dtypes

Out[34]:

Levy                float64
Manufacturer          int64
Model                 int64
Category              int64
Leather interior      int64
Fuel type             int64
Engine volume       float64
Mileage             float64
Cylinders           float64
Gear box type         int64
Drive wheels          int64
Doors                 int64
Wheel                 int64
Color                 int64
Airbags             float64
Age                 float64
dtype: object

In [35]:

random_state = 9

set_config(transform_output="pandas")

models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
}

Обучение и оценка моделей с помощью различных алгоритмов¶

In [36]:

for model_name in models.keys():
    print(f"Model: {model_name}")

    model = models[model_name]["model"]

    fitted_model = model.fit(train_df.values, price_y_train.values.ravel())

    y_train_pred = fitted_model.predict(train_df)
    y_test_pred = fitted_model.predict(test_df)

    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred

    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(price_y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(price_y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(price_y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(price_y_test, y_test_pred)

Model: linear
Model: linear_poly

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but LinearRegression was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but LinearRegression was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but PolynomialFeatures was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but PolynomialFeatures was fitted without feature names
  warnings.warn(

Model: linear_interact

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but PolynomialFeatures was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but PolynomialFeatures was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RidgeCV was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RidgeCV was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but DecisionTreeRegressor was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but DecisionTreeRegressor was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but KNeighborsRegressor was fitted without feature names
  warnings.warn(

Model: ridge
Model: decision_tree
Model: knn

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but KNeighborsRegressor was fitted without feature names
  warnings.warn(

Model: random_forest

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RandomForestRegressor was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RandomForestRegressor was fitted without feature names
  warnings.warn(

Вывод результатов оценки¶

In [37]:

reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])

Out[37]:

	RMSE_train	RMSE_test	RMAE_test	R2_test
random_forest	6319.840151	6857.366530	70.171960	0.604144
decision_tree	6865.806115	7608.471911	73.750577	0.512676
linear_poly	7462.272004	7912.709094	76.673024	0.472924
linear_interact	7853.459094	8132.731304	77.817116	0.443204
ridge	8971.394401	9047.309014	83.353034	0.310932
linear	8971.373140	9047.472417	83.360540	0.310908
knn	7044.273266	12172.932806	97.028762	-0.247422

211 KiB Raw Blame History Unescape Escape