MII/lec4_reg.ipynb
2024-11-15 23:06:57 +04:00

129 KiB
Raw Blame History

Загрузка данных

In [94]:
import pandas as pd

from sklearn import set_config

set_config(transform_output="pandas")

random_state = 9

df = pd.read_csv("data/car_price_prediction.csv", index_col="ID")
df["distance_nokm"] = pd.to_numeric(df["Mileage"].str.replace(" km", "", regex=False))


df["distance_norm"] = df["distance_nokm"].clip(0, 350000)

df.boxplot(column="distance_norm")
df["Levy"] = pd.to_numeric(df["Levy"].str.replace("-", "0", regex=False))
# df["Cylinders"] = pd.to_numeric(df["Cylinders"].str.replace(".", "", regex=False))

# average_mileage = df["distance_norm"].mean()
# print(f"Среднее значение поля 'пробег': {average_mileage}")
# average_mileage = df["distance_norm"].mean()
# df["above_average_mileage"] = (df["distance_norm"] > average_mileage).astype(int)

df
Out[94]:
Price Levy Manufacturer Model Prod_year Category Leather_interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags distance_nokm distance_norm
ID
45654403 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 km 6.0 Automatic 4x4 04-May Left wheel Silver 12 186005 186005
44731507 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3 192000 km 6.0 Tiptronic 4x4 04-May Left wheel Black 8 192000 192000
45774419 8467 0 HONDA FIT 2006 Hatchback No Petrol 1.3 200000 km 4.0 Variator Front 04-May Right-hand drive Black 2 200000 200000
45769185 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 km 4.0 Automatic 4x4 04-May Left wheel White 0 168966 168966
45809263 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 km 4.0 Automatic Front 04-May Left wheel Silver 4 91901 91901
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45798355 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe Yes CNG 2.0 Turbo 300000 km 4.0 Manual Rear 02-Mar Left wheel Silver 5 300000 300000
45778856 15681 831 HYUNDAI Sonata 2011 Sedan Yes Petrol 2.4 161600 km 4.0 Tiptronic Front 04-May Left wheel Red 8 161600 161600
45804997 26108 836 HYUNDAI Tucson 2010 Jeep Yes Diesel 2 116365 km 4.0 Automatic Front 04-May Left wheel Grey 4 116365 116365
45793526 5331 1288 CHEVROLET Captiva 2007 Jeep Yes Diesel 2 51258 km 4.0 Automatic Front 04-May Left wheel Black 4 51258 51258
45813273 470 753 HYUNDAI Sonata 2012 Sedan Yes Hybrid 2.4 186923 km 4.0 Automatic Front 04-May Left wheel White 12 186923 186923

19237 rows × 19 columns

In [95]:
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from transformers import TitanicFeatures


# columns_to_drop = ["Survived", "Name", "Cabin", "Ticket", "Embarked", "Parch", "Fare"]
columns_to_drop = [
    # "Price",
    "Doors",
    # "Color",
    # "Gear box type",
    # "Prod_year",
    "Mileage",
    # "Airbags",
    "Levy",
    # "Leather_interior",
    "Fuel type",
    "Drive wheels",
    "Engine volume",
    "Wheel",
    "distance_nokm",
    "Model",
     "Cylinders"
]
num_columns = [
    "Airbags",
    "distance_norm",
    # "Cylinders"
]
cat_columns = [
    "Color",
    "Gear box type",
    "Leather_interior",
    "Manufacturer",
    "Category"
]

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
        #("prepocessing_features", cat_imputer, ["Name", "Cabin"]),
    ],
    remainder="passthrough"
)

# features_engineering = ColumnTransformer(
#     verbose_feature_names_out=False,
#     transformers=[
#         ("add_features", TitanicFeatures(), ["Name", "Cabin"]),
#     ],
#     remainder="passthrough",
# )

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

# features_postprocessing = ColumnTransformer(
#     verbose_feature_names_out=False,
#     transformers=[
#         ("prepocessing_cat", preprocessing_cat, ["Cabin_type"]),
#     ],
#     remainder="passthrough",
# )

pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
       # ("features_engineering", features_engineering),
        ("drop_columns", drop_columns),
       # ("features_postprocessing", features_postprocessing),
    ]
)
In [96]:
preprocessing_result = pipeline_end.fit_transform(df)
df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

Формирование выборок

In [97]:
from utils import split_stratified_into_train_val_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df,
    target_colname="Price",
    stratify_colname="Airbags",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=random_state,
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
Airbags distance_norm Color_Black Color_Blue Color_Brown Color_Carnelian red Color_Golden Color_Green Color_Grey Color_Orange ... Category_Hatchback Category_Jeep Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan Category_Universal Price Prod_year
ID
45664431 -1.523737 0.012512 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1019 2013
45186186 1.254005 -0.768336 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 14113 2010
45757605 -0.134866 0.093264 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 15994 2010
45761763 -1.060780 -0.935158 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 24891 2011
45762505 1.254005 0.496836 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 706 2015
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45738958 -0.134866 1.046858 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 7840 2005
45761444 1.254005 -0.288173 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 37633 2010
45809946 -0.597823 -0.567004 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5209 2013
45789900 0.328091 -0.568642 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 14426 2006
45788440 -0.597823 0.037171 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12544 2011

15389 rows × 97 columns

'y_train'
Price
ID
45664431 1019
45186186 14113
45757605 15994
45761763 24891
45762505 706
... ...
45738958 7840
45761444 37633
45809946 5209
45789900 14426
45788440 12544

15389 rows × 1 columns

'X_test'
Airbags distance_norm Color_Black Color_Blue Color_Brown Color_Carnelian red Color_Golden Color_Green Color_Grey Color_Orange ... Category_Hatchback Category_Jeep Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan Category_Universal Price Prod_year
ID
41976837 0.328091 -1.184551 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 90006 2015
45793567 -0.597823 -0.719669 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 7850 2009
45812786 -1.060780 1.832171 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2352 1998
45808317 1.254005 -0.453739 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1333 2015
45809653 -0.597823 -1.437871 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 62493 2018
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45793612 -1.060780 -0.018923 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 51746 2014
45567237 -1.523737 0.242249 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 784 2014
45802935 -0.597823 0.299701 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 41221 2016
45102093 -0.597823 0.279496 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10976 2003
45733630 -0.597823 -1.055951 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 55343 2018

3848 rows × 97 columns

'y_test'
Price
ID
41976837 90006
45793567 7850
45812786 2352
45808317 1333
45809653 62493
... ...
45793612 51746
45567237 784
45802935 41221
45102093 10976
45733630 55343

3848 rows × 1 columns

In [98]:
X_test
Out[98]:
Airbags distance_norm Color_Black Color_Blue Color_Brown Color_Carnelian red Color_Golden Color_Green Color_Grey Color_Orange ... Category_Hatchback Category_Jeep Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan Category_Universal Price Prod_year
ID
41976837 0.328091 -1.184551 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 90006 2015
45793567 -0.597823 -0.719669 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 7850 2009
45812786 -1.060780 1.832171 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2352 1998
45808317 1.254005 -0.453739 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1333 2015
45809653 -0.597823 -1.437871 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 62493 2018
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45793612 -1.060780 -0.018923 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 51746 2014
45567237 -1.523737 0.242249 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 784 2014
45802935 -0.597823 0.299701 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 41221 2016
45102093 -0.597823 0.279496 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10976 2003
45733630 -0.597823 -1.055951 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 55343 2018

3848 rows × 97 columns

In [99]:
X_train
Out[99]:
Airbags distance_norm Color_Black Color_Blue Color_Brown Color_Carnelian red Color_Golden Color_Green Color_Grey Color_Orange ... Category_Hatchback Category_Jeep Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan Category_Universal Price Prod_year
ID
45664431 -1.523737 0.012512 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1019 2013
45186186 1.254005 -0.768336 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 14113 2010
45757605 -0.134866 0.093264 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 15994 2010
45761763 -1.060780 -0.935158 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 24891 2011
45762505 1.254005 0.496836 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 706 2015
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45738958 -0.134866 1.046858 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 7840 2005
45761444 1.254005 -0.288173 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 37633 2010
45809946 -0.597823 -0.567004 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5209 2013
45789900 0.328091 -0.568642 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 14426 2006
45788440 -0.597823 0.037171 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12544 2011

15389 rows × 97 columns

In [100]:
y_test
Out[100]:
Price
ID
41976837 90006
45793567 7850
45812786 2352
45808317 1333
45809653 62493
... ...
45793612 51746
45567237 784
45802935 41221
45102093 10976
45733630 55343

3848 rows × 1 columns

In [101]:
y_train
Out[101]:
Price
ID
45664431 1019
45186186 14113
45757605 15994
45761763 24891
45762505 706
... ...
45738958 7840
45761444 37633
45809946 5209
45789900 14426
45788440 12544

15389 rows × 1 columns

Определение перечня алгоритмов решения задачи аппроксимации (регрессии)

In [102]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

random_state = 9

models = {
    # "mlp": {
    #     "model": neural_network.MLPRegressor(
    #         activation="tanh",
    #         hidden_layer_sizes=(3,),
    #         max_iter=500,
    #         early_stopping=True,
    #         random_state=random_state,
    #     )
    # },
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
}

Определение функции для стандартизации значений в столбце "Температура" для MLP

In [103]:
from pandas import DataFrame
from sklearn import preprocessing
import numpy as np

stndart_scaler = preprocessing.StandardScaler()


def std_temp(df: DataFrame) -> DataFrame:
    df["distance_norm"] = np.array(stndart_scaler.fit_transform(
        df["distance_norm"].reshape(-1, 1))
    ).reshape(df["distance_norm"].shape)
    return df


def std_temp2(df: DataFrame) -> DataFrame:
    # Преобразуем столбец в массив NumPy и применяем reshape
    df["distance_norm"] = stndart_scaler.fit_transform(
        df["distance_norm"].values.reshape(-1, 1)  # Изменяем на reshape(-1, 1)
    )
    return df


def std_temp3(df: DataFrame) -> DataFrame:
    # Проверка на NaN и заполнение средним значением
    df["distance_norm"].fillna(df["distance_norm"].mean(), inplace=True)

    # Преобразуем столбец в массив NumPy и применяем reshape
    df["distance_norm"] = stndart_scaler.fit_transform(
        df["distance_norm"].values.reshape(-1, 1)
    )
    return df
In [104]:
print(X_train.isnull().sum())
print(X_test.isnull().sum())
Airbags               0
distance_norm         0
Color_Black           0
Color_Blue            0
Color_Brown           0
                     ..
Category_Pickup       0
Category_Sedan        0
Category_Universal    0
Price                 0
Prod_year             0
Length: 97, dtype: int64
Airbags               0
distance_norm         0
Color_Black           0
Color_Blue            0
Color_Brown           0
                     ..
Category_Pickup       0
Category_Sedan        0
Category_Universal    0
Price                 0
Prod_year             0
Length: 97, dtype: int64

Обучение и оценка моделей с помощью различных алгоритмов

In [105]:
import math
from pandas import DataFrame
from sklearn import metrics





for model_name in models.keys():
    print(f"Model: {model_name}")
    X_train: DataFrame = X_train.copy()
    X_test: DataFrame = X_test.copy()

    if model_name == "mlp":
        X_train = std_temp(X_train)
        X_test = std_temp(X_test)

    fitted_model = models[model_name]["model"].fit(
        X_train.values, y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(X_train.values)
    y_test_pred = fitted_model.predict(X_test.values)
    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(y_test, y_test_pred)
Model: linear
Model: linear_poly
Model: linear_interact
Model: ridge
Model: decision_tree
Model: knn
Model: random_forest

Вывод результатов оценки

In [106]:
reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[106]:
  RMSE_train RMSE_test RMAE_test R2_test
linear 0.000000 0.000000 0.000005 1.000000
linear_interact 0.000000 0.000000 0.000071 1.000000
linear_poly 0.000001 0.000069 0.001155 1.000000
ridge 676.783751 14251.537610 25.290145 0.998871
decision_tree 418.103239 410021.797952 82.474628 0.065532
random_forest 1051.981912 413832.680119 82.077377 0.048080
knn 4091.598286 417393.077890 82.148464 0.031630

Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок

Получение лучшей модели

In [107]:
best_model = str(reg_metrics.sort_values(by="RMSE_test").iloc[0].name)

display(best_model)
'linear'

Вывод для обучающей выборки

In [ ]:
pd.concat(
    [
        X_train,
        y_train,
        pd.Series(
            models[best_model]["train_preds"],
            index=y_train.index,
            name="PricePred",
        ),
    ],
    axis=1,
).head(5)
Out[ ]:
Airbags distance_norm Color_Black Color_Blue Color_Brown Color_Carnelian red Color_Golden Color_Green Color_Grey Color_Orange ... Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan Category_Universal Price Prod_year Price DensityPred
ID
45664431 -1.523737 0.012512 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1019 2013 1019 1019.0
45186186 1.254005 -0.768336 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 14113 2010 14113 14113.0
45757605 -0.134866 0.093264 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 15994 2010 15994 15994.0
45761763 -1.060780 -0.935158 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 24891 2011 24891 24891.0
45762505 1.254005 0.496836 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 706 2015 706 706.0

5 rows × 99 columns

Вывод для тестовой выборки

In [ ]:
pd.concat(
    [
        X_test,
        y_test,
        pd.Series(
            models[best_model]["preds"],
            index=y_test.index,
            name="PricePred",
        ),
    ],
    axis=1,
).head(5)
Out[ ]:
Airbags distance_norm Color_Black Color_Blue Color_Brown Color_Carnelian red Color_Golden Color_Green Color_Grey Color_Orange ... Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan Category_Universal Price Prod_year Price DensityPred
ID
41976837 0.328091 -1.184551 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 90006 2015 90006 90006.0
45793567 -0.597823 -0.719669 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 7850 2009 7850 7850.0
45812786 -1.060780 1.832171 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 2352 1998 2352 2352.0
45808317 1.254005 -0.453739 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 1333 2015 1333 1333.0
45809653 -0.597823 -1.437871 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 62493 2018 62493 62493.0

5 rows × 99 columns