mai_pi-33_zakharov/notebooks/lab4_pipeline.ipynb


Loading the dataset

In [69]:
import pandas as pd
import numpy as np
import math

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn import linear_model, tree, neighbors, ensemble

from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.pipeline import make_pipeline

from transformers import CarsFeatures
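# NOTE: CarsFeatures is assumed to come from a project-local transformers.py module
# (a custom scikit-learn transformer), not from the Hugging Face `transformers` package.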


set_config(transform_output="pandas")

random_state = 9

df = pd.read_csv("../data/car-price-prediction.csv")

df = df.drop(columns=["Unnamed: 0"])

df
Out[69]:
Price Levy Manufacturer Model Prod. year Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags
0 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 6 Automatic 4x4 Четырехдверный Left wheel Silver 12
1 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3.0 192000 6 Tiptronic 4x4 Четырехдверный Left wheel Black 8
2 8467 0 HONDA FIT 2006 Hatchback No Petrol 1.3 200000 4 Variator Front Четырехдверный Right-hand drive Black 2
3 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 4 Automatic 4x4 Четырехдверный Left wheel White 0
4 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 4 Automatic Front Четырехдверный Left wheel Silver 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12592 8781 1107 OPEL Combo 2007 Goods wagon No Diesel 1.7 236000 4 Manual Front Четырехдверный Left wheel Beige 4
12593 7840 0 NISSAN Skyline 2003 Sedan Yes Petrol 3.0 220000 6 Tiptronic Rear Четырехдверный Right-hand drive White 0
12594 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe Yes CNG 2.0 300000 4 Manual Rear Двухдверный Left wheel Silver 5
12595 15681 831 HYUNDAI Sonata 2011 Sedan Yes Petrol 2.4 161600 4 Tiptronic Front Четырехдверный Left wheel Red 8
12596 26108 836 HYUNDAI Tucson 2010 Jeep Yes Diesel 2.0 116365 4 Automatic Front Четырехдверный Left wheel Grey 4

12597 rows × 17 columns

In [70]:
df.dtypes
Out[70]:
Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object

Splitting into training and test sets

In [71]:
X = df
y = df["Category"]

train_df, test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df.shape[0]} записей")
print(train_df.Category.value_counts())
print(f"Тестовая выборка: {test_df.shape[0]} записей")
print(test_df.Category.value_counts())
Размеры выборок:
Обучающая выборка: 8817 записей
Category
Sedan          3954
Jeep           2263
Hatchback      1554
Minivan         312
Coupe           251
Universal       180
Microbus        143
Goods wagon     120
Pickup           22
Cabriolet        16
Limousine         2
Name: count, dtype: int64
Test set: 3780 records
Category
Sedan          1692
Jeep            990
Hatchback       636
Minivan         151
Coupe           117
Universal        82
Goods wagon      52
Microbus         46
Pickup            8
Cabriolet         5
Limousine         1
Name: count, dtype: int64
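
The split above is not stratified, so the class proportions differ slightly between the two samples (Limousine, for example, ends up with 2 training records and 1 test record). A minimal sketch of the same split with stratification by Category, using the standard stratify parameter of train_test_split:

# Hypothetical alternative: keep class proportions equal in both samples
train_df_s, test_df_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)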

Oversampling

In [72]:
def oversample(df):
    X = df.drop("Category", axis=1)
    y = df["Category"]

    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore

    resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
    return resampled_df


train_df_overs = oversample(train_df)
test_df_overs = oversample(test_df)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df_overs.shape[0]} записей")
print(train_df_overs.Category.value_counts())
print(f"Тестовая выборка: {test_df_overs.shape[0]} записей")
print(test_df_overs.Category.value_counts())
Размеры выборок:
Обучающая выборка: 43494 записей
Category
Sedan          3954
Jeep           3954
Universal      3954
Hatchback      3954
Coupe          3954
Goods wagon    3954
Minivan        3954
Microbus       3954
Pickup         3954
Limousine      3954
Cabriolet      3954
Name: count, dtype: int64
Test set: 18612 records
Category
Hatchback      1692
Sedan          1692
Universal      1692
Jeep           1692
Coupe          1692
Minivan        1692
Goods wagon    1692
Microbus       1692
Pickup         1692
Cabriolet      1692
Limousine      1692
Name: count, dtype: int64
In [73]:
price_y_train = train_df["Price"]
price_y_test = test_df["Price"]

Building the data preprocessing pipeline

In [74]:
columns_to_drop = ["Price", "Color", "Model", "Manufacturer"]

num_columns = [
    column
    for column in df.columns
        if column not in columns_to_drop and df[column].dtype != "object"
]

cat_columns = [
    column
    for column in df.columns
        if column not in columns_to_drop and df[column].dtype == "object"
]

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
        # ("prepocessing_features", num_imputer, ["Prod. year"]),
    ],
    remainder="passthrough",
)

features_engineering = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("add_features", CarsFeatures(), ["Prod. year"]),
    ],
    remainder="passthrough",
)

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

features_postprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, ["Age"]),
    ],
    remainder="passthrough",
)


pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ]
)
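
Note that features_engineering and features_postprocessing are defined above but not wired into pipeline_end, and the CarsFeatures transformer itself is not shown in this notebook. Since features_engineering applies it to "Prod. year" and features_postprocessing then scales an "Age" column, it presumably derives the car's age from its production year. A minimal sketch of such a transformer, under that assumption (the actual implementation in the local transformers.py may differ):

from datetime import date

from sklearn.base import BaseEstimator, TransformerMixin


class CarsFeatures(BaseEstimator, TransformerMixin):
    """Hypothetical sketch: replace the production year with the car's age."""

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        # Age of the car in years relative to the current year
        X["Age"] = date.today().year - X["Prod. year"]
        return X.drop(columns=["Prod. year"])

    def get_feature_names_out(self, input_features=None):
        return ["Age"]

With these pieces, the fuller pipeline would presumably insert features_engineering and features_postprocessing after the preprocessing step, so that the derived Age column is imputed and scaled like the other numeric features.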

Demonstrating the data preprocessing pipeline

In [75]:
preprocessing_result = pipeline_end.fit_transform(train_df)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

preprocessed_df
Out[75]:
Levy Prod. year Engine volume Mileage Cylinders Airbags Category_Coupe Category_Goods wagon Category_Hatchback Category_Jeep ... Fuel type_Petrol Fuel type_Plug-in Hybrid Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator Drive wheels_Front Drive wheels_Rear Doors_Многодверный Doors_Четырехдверный Wheel_Right-hand drive
10083 -1.192982 1.946936 -0.479341 -1.531744 -0.403213 -0.683755 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
9482 -1.192982 -0.879266 -0.887855 -0.130245 -0.403213 -1.190217 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0
6177 0.081576 0.642535 -0.479341 -0.651122 -0.403213 0.835631 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
11756 -1.192982 -1.531466 0.473858 1.263152 -0.403213 -0.430524 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
6557 1.703146 1.512135 0.473858 -0.739330 -0.403213 -0.683755 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11964 -1.192982 -0.879266 0.337687 0.318018 1.538421 -0.683755 0.0 0.0 0.0 0.0 ... 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
5191 0.476602 0.859935 -0.206998 -0.011145 -0.403213 1.342092 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
5390 0.646834 -0.661866 -0.887855 1.155137 -0.403213 0.329169 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0
860 -1.192982 1.077335 -0.751684 -0.697325 -0.403213 0.329169 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
7270 0.446048 0.425135 0.337687 -0.465093 7.363324 0.329169 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0

8817 rows × 31 columns

Building the set of models

In [76]:
models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
}

Training and evaluating the models with different algorithms

In [77]:
for model_name in models.keys():
    print(f"Model: {model_name}")

    model = models[model_name]["model"]

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(train_df, price_y_train.values.ravel())

    y_train_pred = model_pipeline.predict(train_df)
    y_test_pred = model_pipeline.predict(test_df)

    models[model_name]["fitted"] = model_pipeline
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(price_y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(price_y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(price_y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(price_y_test, y_test_pred)
Model: linear
Model: linear_poly
Model: linear_interact
Model: ridge
Model: decision_tree
Model: knn
Model: random_forest
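
For reference, the quantities stored in the loop above are the root mean squared error on the training and test samples, the square root of the mean absolute error (labelled RMAE here), and the coefficient of determination:

$$\mathrm{RMSE} = \sqrt{\tfrac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}, \qquad \mathrm{RMAE} = \sqrt{\tfrac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i\rvert}, \qquad R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2},$$

where $y_i$ are the actual prices, $\hat{y}_i$ the predicted prices, and $\bar{y}$ the mean actual price.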

Displaying the evaluation results

In [78]:
reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[78]:
  RMSE_train RMSE_test RMAE_test R2_test
knn 5777.663053 6607.095563 67.047388 0.632511
random_forest 6567.406346 6852.474190 70.119860 0.604708
decision_tree 7022.066577 7393.444466 72.910390 0.539832
linear 8399.752941 8498.166215 80.676781 0.392042
ridge 8400.004465 8498.452033 80.683952 0.392001
linear_poly 6880.451269 3235067552304.437988 267308.588962 -88102783177967152.000000
linear_interact 7037.525048 113842510019087.921875 1576050.007127 -109101782409976135680.000000
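
Since each fitted estimator stored in models includes the full preprocessing pipeline, it can be applied to raw rows directly. A small usage sketch (assuming the objects defined above), picking the model with the lowest test RMSE and predicting prices for a few raw test records:

# Select the best model by test RMSE and predict on unprocessed test rows
best_name = reg_metrics["RMSE_test"].idxmin()
best_model = models[best_name]["fitted"]

print(f"Best model by test RMSE: {best_name}")
print(best_model.predict(test_df.head()))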