mai_pi-33_zakharov/notebooks/lab4_pipeline.ipynb


Loading the dataset

In [69]:
import pandas as pd
import numpy as np
import math

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn import linear_model, tree, neighbors, ensemble

from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.pipeline import make_pipeline

from transformers import CarsFeatures
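# NOTE: CarsFeatures is assumed to come from a project-local transformers.py module
# (a custom scikit-learn transformer), not from the Hugging Face `transformers` package.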


set_config(transform_output="pandas")

random_state = 9

df = pd.read_csv("../data/car-price-prediction.csv")

df = df.drop(columns=["Unnamed: 0"])

df
Out[69]:
Price Levy Manufacturer Model Prod. year Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags
0 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 6 Automatic 4x4 Четырехдверный Left wheel Silver 12
1 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3.0 192000 6 Tiptronic 4x4 Четырехдверный Left wheel Black 8
2 8467 0 HONDA FIT 2006 Hatchback No Petrol 1.3 200000 4 Variator Front Четырехдверный Right-hand drive Black 2
3 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 4 Automatic 4x4 Четырехдверный Left wheel White 0
4 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 4 Automatic Front Четырехдверный Left wheel Silver 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12592 8781 1107 OPEL Combo 2007 Goods wagon No Diesel 1.7 236000 4 Manual Front Четырехдверный Left wheel Beige 4
12593 7840 0 NISSAN Skyline 2003 Sedan Yes Petrol 3.0 220000 6 Tiptronic Rear Четырехдверный Right-hand drive White 0
12594 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe Yes CNG 2.0 300000 4 Manual Rear Двухдверный Left wheel Silver 5
12595 15681 831 HYUNDAI Sonata 2011 Sedan Yes Petrol 2.4 161600 4 Tiptronic Front Четырехдверный Left wheel Red 8
12596 26108 836 HYUNDAI Tucson 2010 Jeep Yes Diesel 2.0 116365 4 Automatic Front Четырехдверный Left wheel Grey 4

12597 rows × 17 columns

In [70]:
df.dtypes
Out[70]:
Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object

Splitting into training and test sets

In [71]:
X = df
y = df["Category"]

train_df, test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df.shape[0]} записей")
print(train_df.Category.value_counts())
print(f"Тестовая выборка: {test_df.shape[0]} записей")
print(test_df.Category.value_counts())
Размеры выборок:
Обучающая выборка: 8817 записей
Category
Sedan          3954
Jeep           2263
Hatchback      1554
Minivan         312
Coupe           251
Universal       180
Microbus        143
Goods wagon     120
Pickup           22
Cabriolet        16
Limousine         2
Name: count, dtype: int64
Test set: 3780 records
Category
Sedan          1692
Jeep            990
Hatchback       636
Minivan         151
Coupe           117
Universal        82
Goods wagon      52
Microbus         46
Pickup            8
Cabriolet         5
Limousine         1
Name: count, dtype: int64
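
The split above is not stratified, so the class proportions differ slightly between the two samples (Limousine, for example, ends up with 2 training records and 1 test record). A minimal sketch of the same split with stratification by Category, using the standard stratify parameter of train_test_split:

# Hypothetical alternative: keep class proportions equal in both samples
train_df_s, test_df_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)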

Oversampling

In [72]:
def oversample(df):
    X = df.drop("Category", axis=1)
    y = df["Category"]

    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore

    resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
    return resampled_df


train_df_overs = oversample(train_df)
test_df_overs = oversample(test_df)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df_overs.shape[0]} записей")
print(train_df_overs.Category.value_counts())
print(f"Тестовая выборка: {test_df_overs.shape[0]} записей")
print(test_df_overs.Category.value_counts())
Размеры выборок:
Обучающая выборка: 43494 записей
Category
Sedan          3954
Jeep           3954
Universal      3954
Hatchback      3954
Coupe          3954
Goods wagon    3954
Minivan        3954
Microbus       3954
Pickup         3954
Limousine      3954
Cabriolet      3954
Name: count, dtype: int64
Test set: 18612 records
Category
Hatchback      1692
Sedan          1692
Universal      1692
Jeep           1692
Coupe          1692
Minivan        1692
Goods wagon    1692
Microbus       1692
Pickup         1692
Cabriolet      1692
Limousine      1692
Name: count, dtype: int64
In [73]:
price_y_train = train_df["Price"]
price_y_test = test_df["Price"]

Building the data preprocessing pipeline

In [74]:
columns_to_drop = ["Price", "Color", "Model", "Manufacturer"]

num_columns = [
    column
    for column in df.columns
        if column not in columns_to_drop and df[column].dtype != "object"
]

cat_columns = [
    column
    for column in df.columns
        if column not in columns_to_drop and df[column].dtype == "object"
]

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
        # ("prepocessing_features", num_imputer, ["Prod. year"]),
    ],
    remainder="passthrough",
)

features_engineering = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("add_features", CarsFeatures(), ["Prod. year"]),
    ],
    remainder="passthrough",
)

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

features_postprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, ["Age"]),
    ],
    remainder="passthrough",
)


pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ]
)
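
Note that features_engineering and features_postprocessing are defined above but not wired into pipeline_end, and the CarsFeatures transformer itself is not shown in this notebook. Since features_engineering applies it to "Prod. year" and features_postprocessing then scales an "Age" column, it presumably derives the car's age from its production year. A minimal sketch of such a transformer, under that assumption (the actual implementation in the local transformers.py may differ):

from datetime import date

from sklearn.base import BaseEstimator, TransformerMixin


class CarsFeatures(BaseEstimator, TransformerMixin):
    """Hypothetical sketch: replace the production year with the car's age."""

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        # Age of the car in years relative to the current year
        X["Age"] = date.today().year - X["Prod. year"]
        return X.drop(columns=["Prod. year"])

    def get_feature_names_out(self, input_features=None):
        return ["Age"]

With these pieces, the fuller pipeline would presumably insert features_engineering and features_postprocessing after the preprocessing step, so that the derived Age column is imputed and scaled like the other numeric features.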

Demonstrating the data preprocessing pipeline

In [75]:
preprocessing_result = pipeline_end.fit_transform(train_df)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

preprocessed_df
Out[75]:
Levy Prod. year Engine volume Mileage Cylinders Airbags Category_Coupe Category_Goods wagon Category_Hatchback Category_Jeep ... Fuel type_Petrol Fuel type_Plug-in Hybrid Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator Drive wheels_Front Drive wheels_Rear Doors_Многодверный Doors_Четырехдверный Wheel_Right-hand drive
10083 -1.192982 1.946936 -0.479341 -1.531744 -0.403213 -0.683755 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
9482 -1.192982 -0.879266 -0.887855 -0.130245 -0.403213 -1.190217 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0
6177 0.081576 0.642535 -0.479341 -0.651122 -0.403213 0.835631 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
11756 -1.192982 -1.531466 0.473858 1.263152 -0.403213 -0.430524 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
6557 1.703146 1.512135 0.473858 -0.739330 -0.403213 -0.683755 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11964 -1.192982 -0.879266 0.337687 0.318018 1.538421 -0.683755 0.0 0.0 0.0 0.0 ... 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
5191 0.476602 0.859935 -0.206998 -0.011145 -0.403213 1.342092 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
5390 0.646834 -0.661866 -0.887855 1.155137 -0.403213 0.329169 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0
860 -1.192982 1.077335 -0.751684 -0.697325 -0.403213 0.329169 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
7270 0.446048 0.425135 0.337687 -0.465093 7.363324 0.329169 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0

8817 rows × 31 columns

Building the set of models

In [76]:
models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
}

Training and evaluating the models with different algorithms

In [77]:
for model_name in models.keys():
    print(f"Model: {model_name}")

    model = models[model_name]["model"]

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(train_df, price_y_train.values.ravel())

    y_train_pred = model_pipeline.predict(train_df)
    y_test_pred = model_pipeline.predict(test_df)

    models[model_name]["fitted"] = model_pipeline
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(price_y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(price_y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(price_y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(price_y_test, y_test_pred)
Model: linear
Model: linear_poly
Model: linear_interact
Model: ridge
Model: decision_tree
Model: knn
Model: random_forest
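
For reference, the quantities stored in the loop above are the root mean squared error on the training and test samples, the square root of the mean absolute error (labelled RMAE here), and the coefficient of determination:

$$\mathrm{RMSE} = \sqrt{\tfrac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}, \qquad \mathrm{RMAE} = \sqrt{\tfrac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i\rvert}, \qquad R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2},$$

where $y_i$ are the actual prices, $\hat{y}_i$ the predicted prices, and $\bar{y}$ the mean actual price.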

Displaying the evaluation results

In [78]:
reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[78]:
  RMSE_train RMSE_test RMAE_test R2_test
knn 5777.663053 6607.095563 67.047388 0.632511
random_forest 6567.406346 6852.474190 70.119860 0.604708
decision_tree 7022.066577 7393.444466 72.910390 0.539832
linear 8399.752941 8498.166215 80.676781 0.392042
ridge 8400.004465 8498.452033 80.683952 0.392001
linear_poly 6880.451269 3235067552304.437988 267308.588962 -88102783177967152.000000
linear_interact 7037.525048 113842510019087.921875 1576050.007127 -109101782409976135680.000000
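
Since each fitted estimator stored in models includes the full preprocessing pipeline, it can be applied to raw rows directly. A small usage sketch (assuming the objects defined above), picking the model with the lowest test RMSE and predicting prices for a few raw test records:

# Select the best model by test RMSE and predict on unprocessed test rows
best_name = reg_metrics["RMSE_test"].idxmin()
best_model = models[best_name]["fitted"]

print(f"Best model by test RMSE: {best_name}")
print(best_model.predict(test_df.head()))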