MII/lec4.ipynb
2024-11-15 23:06:57 +04:00

250 KiB
Raw Blame History

Загрузка набора данных

In [1]:
import pandas as pd

from sklearn import set_config

set_config(transform_output="pandas")

random_state=9

df = pd.read_csv("data/car_price_prediction.csv", index_col="ID")
df['distance_nokm'] = pd.to_numeric(df['Mileage'].str.replace(' km', '', regex=False))


df["distance_norm"] = df["distance_nokm"].clip(0, 350000)

df.boxplot(column="distance_norm")

average_mileage = df["distance_norm"].mean()
print(f"Среднее значение поля 'пробег': {average_mileage}")
average_mileage = df["distance_norm"].mean()
df["above_average_mileage"] = (df["distance_norm"] > average_mileage).astype(int)

df
Среднее значение поля 'пробег': 136686.7442428653
Out[1]:
Price Levy Manufacturer Model Prod_year Category Leather_interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags distance_nokm distance_norm above_average_mileage
ID
45654403 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 km 6.0 Automatic 4x4 04-May Left wheel Silver 12 186005 186005 1
44731507 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3 192000 km 6.0 Tiptronic 4x4 04-May Left wheel Black 8 192000 192000 1
45774419 8467 - HONDA FIT 2006 Hatchback No Petrol 1.3 200000 km 4.0 Variator Front 04-May Right-hand drive Black 2 200000 200000 1
45769185 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 km 4.0 Automatic 4x4 04-May Left wheel White 0 168966 168966 1
45809263 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 km 4.0 Automatic Front 04-May Left wheel Silver 4 91901 91901 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45798355 8467 - MERCEDES-BENZ CLK 200 1999 Coupe Yes CNG 2.0 Turbo 300000 km 4.0 Manual Rear 02-Mar Left wheel Silver 5 300000 300000 1
45778856 15681 831 HYUNDAI Sonata 2011 Sedan Yes Petrol 2.4 161600 km 4.0 Tiptronic Front 04-May Left wheel Red 8 161600 161600 1
45804997 26108 836 HYUNDAI Tucson 2010 Jeep Yes Diesel 2 116365 km 4.0 Automatic Front 04-May Left wheel Grey 4 116365 116365 0
45793526 5331 1288 CHEVROLET Captiva 2007 Jeep Yes Diesel 2 51258 km 4.0 Automatic Front 04-May Left wheel Black 4 51258 51258 0
45813273 470 753 HYUNDAI Sonata 2012 Sedan Yes Hybrid 2.4 186923 km 4.0 Automatic Front 04-May Left wheel White 12 186923 186923 1

19237 rows × 20 columns

No description has been provided for this image

Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации

Целевой признак -- gear box type - коробка переключения передач. x - полная выборка, y - gear box столбец

In [2]:
from utils import split_stratified_into_train_val_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="above_average_mileage", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=random_state
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[2], line 3
      1 from utils import split_stratified_into_train_val_test
----> 3 X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
      4     df, stratify_colname="above_average_mileage", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=random_state
      5 )
      7 display("X_train", X_train)
      8 display("y_train", y_train)

File c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\utils.py:56, in split_stratified_into_train_val_test(df_input, target_colname, stratify_colname, frac_train, frac_val, frac_test, random_state)
     53     raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
     55 if target_colname not in df_input.columns:
---> 56     raise ValueError("%s is not a column in the dataframe" % (target_colname))
     58 X = df_input  # Contains all columns.
     59 y = df_input[[target_colname]]  # Dataframe of just the column on which to stratify.

ValueError: z is not a column in the dataframe

В итоге, этот код выполняет следующие действия:

  • Заполняет пропущенные значения: В числовых столбцах медианой, в категориальных - значением "unknown".
  • Стандартизирует числовые данные: приводит их к нулевому среднему и единичному стандартному отклонению.
  • Преобразует категориальные данные: использует one-hot-кодирование.
  • Удаляет ненужные столбцы: из списка columns_to_drop.

Формирование конвейера для классификации данных

preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация

preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование

features_preprocessing -- трансформер для предобработки признаков

features_engineering -- трансформер для конструирования признаков

drop_columns -- трансформер для удаления колонок

features_postprocessing -- трансформер для унитарного кодирования новых признаков

pipeline_end -- основной конвейер предобработки данных и конструирования признаков

Конвейер выполняется последовательно.

Трансформер выполняет параллельно для указанного набора колонок.

Документация:

https://scikit-learn.org/1.5/api/sklearn.pipeline.html

https://scikit-learn.org/1.5/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer

In [ ]:
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from transformers import TitanicFeatures


# columns_to_drop = ["Survived", "Name", "Cabin", "Ticket", "Embarked", "Parch", "Fare"]
columns_to_drop = [
    "Doors",
    # "Color",
    # "Gear box type",
    "Prod_year",
    "Mileage",
    # "Airbags",
    "Levy",
    # "Leather_interior",
    "Fuel type",
    "Drive wheels",
    "Engine volume",
    "Wheel",
    "distance_nokm",
    "Model",
    "Cylinders"
]
num_columns = [
    "Airbags",
    "distance_norm"
]
cat_columns = [
    "Color",
    "Gear box type",
    "Leather_interior",
    "Manufacturer",
    "Category"
]

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
        #("prepocessing_features", cat_imputer, ["Name", "Cabin"]),
    ],
    remainder="passthrough"
)

# features_engineering = ColumnTransformer(
#     verbose_feature_names_out=False,
#     transformers=[
#         ("add_features", TitanicFeatures(), ["Name", "Cabin"]),
#     ],
#     remainder="passthrough",
# )

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

# features_postprocessing = ColumnTransformer(
#     verbose_feature_names_out=False,
#     transformers=[
#         ("prepocessing_cat", preprocessing_cat, ["Cabin_type"]),
#     ],
#     remainder="passthrough",
# )

pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
       # ("features_engineering", features_engineering),
        ("drop_columns", drop_columns),
       # ("features_postprocessing", features_postprocessing),
    ]
)

Демонстрация работы конвейера для предобработки данных при классификации

In [ ]:
preprocessing_result = pipeline_end.fit_transform(X_train)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

preprocessed_df
Out[ ]:
Airbags distance_norm Color_Black Color_Blue Color_Brown Color_Carnelian red Color_Golden Color_Green Color_Grey Color_Orange ... Category_Hatchback Category_Jeep Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan Category_Universal Price above_average_mileage
ID
45556857 1.257079 -0.632902 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 82166 0
45771273 1.257079 1.347620 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 282 1
45790387 -1.519311 0.684031 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 18817 1
45759169 -0.593848 -0.732507 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 14113 0
45721228 -0.593848 -1.528630 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 30 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45755768 -1.287945 0.341202 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5645 1
45803089 -0.593848 -0.663681 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 13799 0
45809088 -0.593848 -0.300038 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 12074 0
45816478 -0.131116 -0.431363 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 10192 0
45767026 1.719811 -0.794133 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 17249 0

15389 rows × 96 columns

Формирование набора моделей для классификации

logistic -- логистическая регрессия

ridge -- гребневая регрессия

decision_tree -- дерево решений

knn -- k-ближайших соседей

naive_bayes -- наивный Байесовский классификатор

gradient_boosting -- метод градиентного бустинга (набор деревьев решений)

random_forest -- метод случайного леса (набор деревьев решений)

mlp -- многослойный персептрон (нейронная сеть)

Документация: https://scikit-learn.org/1.5/supervised_learning.html

In [ ]:
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree

class_models = {
    "logistic": {"model": linear_model.LogisticRegression()},
    # "ridge": {"model": linear_model.RidgeClassifierCV(cv=5, class_weight="balanced")},
    "ridge": {"model": linear_model.LogisticRegression(penalty="l2", class_weight="balanced")},
    "decision_tree": {
        "model": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
    "naive_bayes": {"model": naive_bayes.GaussianNB()},
    "gradient_boosting": {
        "model": ensemble.GradientBoostingClassifier(n_estimators=210)
    },
    "random_forest": {
        "model": ensemble.RandomForestClassifier(
            max_depth=11, class_weight="balanced", random_state=random_state
        )
    },
    "mlp": {
        "model": neural_network.MLPClassifier(
            hidden_layer_sizes=(7,),
            max_iter=100000,
            early_stopping=True,
            random_state=random_state,
        )
    },
}
In [ ]:
# print(y_train.dtypes)
# print(y_test.dtypes)
# df.info()
print(y_train.head())
print(y_test.head())
          above_average_mileage
ID                             
45556857                      0
45771273                      1
45790387                      1
45759169                      0
45721228                      0
          above_average_mileage
ID                             
45812690                      0
45795399                      0
45787355                      0
45798288                      0
45769626                      1

Обучение моделей на обучающем наборе данных и оценка на тестовом

In [ ]:
import numpy as np
from sklearn import metrics

for model_name in class_models.keys():
    print(f"Model: {model_name}")
    model = class_models[model_name]["model"]

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())

    y_train_predict = model_pipeline.predict(X_train)
    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]
    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)

    class_models[model_name]["pipeline"] = model_pipeline
    class_models[model_name]["probs"] = y_test_probs
    class_models[model_name]["preds"] = y_test_predict

    print(y_train_predict)
    print(y_test_predict)
    class_models[model_name]["Precision_train"] = metrics.precision_score(
        y_train, y_train_predict, average="micro"
    )
    class_models[model_name]["Precision_test"] = metrics.precision_score(
        y_test, y_test_predict, average="micro"
    )
    class_models[model_name]["Recall_train"] = metrics.recall_score(
        y_train, y_train_predict, average="micro"
    )
    class_models[model_name]["Recall_test"] = metrics.recall_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Accuracy_train"] = metrics.accuracy_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Accuracy_test"] = metrics.accuracy_score(
        y_test, y_test_predict
    )
    class_models[model_name]["ROC_AUC_test"] = metrics.roc_auc_score(
        y_test, y_test_probs
    )
    class_models[model_name]["F1_train"] = metrics.f1_score(y_train, y_train_predict)
    class_models[model_name]["F1_test"] = metrics.f1_score(y_test, y_test_predict)
    class_models[model_name]["MCC_test"] = metrics.matthews_corrcoef(
        y_test, y_test_predict
    )
    class_models[model_name]["Cohen_kappa_test"] = metrics.cohen_kappa_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Confusion_matrix"] = metrics.confusion_matrix(
        y_test, y_test_predict
    )
Model: logistic
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [3] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
[0 1 1 ... 0 0 0]
[0 0 0 ... 0 1 0]
Model: ridge
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [3] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [3] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
[0 1 1 ... 0 0 0]
[0 0 0 ... 0 1 0]
Model: decision_tree
[0 1 1 ... 0 0 0]
[0 0 0 ... 0 1 0]
Model: knn
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [3] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
[0 1 1 ... 0 0 0]
[0 0 0 ... 1 1 1]
Model: naive_bayes
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [3] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
[0 1 1 ... 1 1 1]
[1 1 1 ... 1 1 1]
Model: gradient_boosting
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [3] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
[0 1 1 ... 0 0 0]
[0 0 0 ... 0 1 0]
Model: random_forest
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [3] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
[0 1 1 ... 0 0 0]
[0 0 0 ... 0 1 0]
Model: mlp
[0 1 0 ... 1 1 0]
[1 1 0 ... 1 0 1]
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [3] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(

Сводная таблица оценок качества для использованных моделей классификации

Документация: https://scikit-learn.org/1.5/modules/model_evaluation.html

Матрица неточностей

In [ ]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)
for index, key in enumerate(class_models.keys()):
    c_matrix = class_models[key]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Less", "More"]
    ).plot(ax=ax.flat[index])
    disp.ax_.set_title(key)

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
No description has been provided for this image
No description has been provided for this image

Точность, полнота, верность (аккуратность), F-мера

In [ ]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
]
class_metrics.sort_values(
    by="Accuracy_test", ascending=False
).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)
Out[ ]:
  Precision_train Precision_test Recall_train Recall_test Accuracy_train Accuracy_test F1_train F1_test
logistic 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
ridge 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
decision_tree 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
gradient_boosting 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
random_forest 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
knn 0.929040 0.897089 0.929040 0.841681 0.929040 0.897089 0.918871 0.880723
mlp 0.621873 0.611486 0.621873 0.549799 0.621873 0.611486 0.574417 0.560940
naive_bayes 0.499903 0.505457 0.499903 0.978123 0.499903 0.505457 0.637118 0.641011

ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса

In [ ]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
]
class_metrics.sort_values(by="ROC_AUC_test", ascending=False).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)
Out[ ]:
  Accuracy_test F1_test ROC_AUC_test Cohen_kappa_test MCC_test
logistic 1.000000 1.000000 1.000000 1.000000 1.000000
ridge 1.000000 1.000000 1.000000 1.000000 1.000000
decision_tree 1.000000 1.000000 1.000000 1.000000 1.000000
gradient_boosting 1.000000 1.000000 1.000000 1.000000 1.000000
random_forest 1.000000 1.000000 1.000000 1.000000 1.000000
knn 0.897089 0.880723 0.959929 0.790571 0.793206
naive_bayes 0.505457 0.641011 0.686884 0.086588 0.180162
mlp 0.611486 0.560940 0.666638 0.212793 0.212933
In [ ]:
best_model = str(class_metrics.sort_values(by="MCC_test", ascending=False).iloc[0].name)

display(best_model)
'logistic'

Вывод данных с ошибкой предсказания для оценки

In [ ]:
preprocessing_result = pipeline_end.transform(X_test)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

y_pred = class_models[best_model]["preds"]

error_index = y_test[y_test["above_average_mileage"] != y_pred].index.tolist()
display(f"Error items count: {len(error_index)}")

error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]
error_df = X_test.loc[error_index].copy()
error_df.insert(loc=1, column="Predicted", value=error_predicted)
error_df.sort_index()
c:\Users\1\Desktop\улгту\3 курс\МИИ\mai\.venv\Lib\site-packages\sklearn\preprocessing\_encoders.py:242: UserWarning: Found unknown categories in columns [3] during transform. These unknown categories will be encoded as all zeros
  warnings.warn(
'Error items count: 0'
Out[ ]:
Price Predicted Levy Manufacturer Model Prod_year Category Leather_interior Fuel type Engine volume ... Cylinders Gear box type Drive wheels Doors Wheel Color Airbags distance_nokm distance_norm above_average_mileage
ID

0 rows × 21 columns

Пример использования обученной модели (конвейера) для предсказания

In [ ]:
model = class_models[best_model]["pipeline"]

example_id = 45812690
test = pd.DataFrame(X_test.loc[example_id, :]).T
test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T
display(test)
display(test_preprocessed)
result_proba = model.predict_proba(test)[0]
result = model.predict(test)[0]
real = int(y_test.loc[example_id].values[0])
display(f"predicted: {result} (proba: {result_proba})")
display(f"real: {real}")
Price Levy Manufacturer Model Prod_year Category Leather_interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags distance_nokm distance_norm above_average_mileage
45812690 6899 645 HONDA Civic 2011 Sedan Yes Petrol 1.8 106000 km 4.0 Automatic Front 04-May Left wheel Black 8 106000 106000 0
Airbags distance_norm Color_Black Color_Blue Color_Brown Color_Carnelian red Color_Golden Color_Green Color_Grey Color_Orange ... Category_Hatchback Category_Jeep Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan Category_Universal Price above_average_mileage
45812690 0.331616 -0.34179 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 6899.0 0.0

1 rows × 96 columns

'predicted: 0 (proba: [0.99824886 0.00175114])'
'real: 0'
In [ ]:
from sklearn.model_selection import GridSearchCV

optimized_model_type = "random_forest"

random_forest_model = class_models[optimized_model_type]["pipeline"]

param_grid = {
    "model__n_estimators": [10, 20, 30, 40, 50, 100, 150, 200, 250, 500],
    "model__max_features": ["sqrt", "log2", 2],
    "model__max_depth": [2, 3, 4, 5, 6, 7, 8, 9 ,10],
    "model__criterion": ["gini", "entropy", "log_loss"],
}

gs_optomizer = GridSearchCV(
    estimator=random_forest_model, param_grid=param_grid, n_jobs=-1
)
gs_optomizer.fit(X_train, y_train.values.ravel())
gs_optomizer.best_params_

Обучение модели с новыми гиперпараметрами

In [ ]:
optimized_model = ensemble.RandomForestClassifier(
    random_state=random_state,
    criterion="gini",
    max_depth=7,
    max_features="sqrt",
    n_estimators=30,
)

result = {}

result["pipeline"] = Pipeline([("pipeline", pipeline_end), ("model", optimized_model)]).fit(X_train, y_train.values.ravel())
result["train_preds"] = result["pipeline"].predict(X_train)
result["probs"] = result["pipeline"].predict_proba(X_test)[:, 1]
result["preds"] = np.where(result["probs"] > 0.5, 1, 0)

result["Precision_train"] = metrics.precision_score(y_train, result["train_preds"])
result["Precision_test"] = metrics.precision_score(y_test, result["preds"])
result["Recall_train"] = metrics.recall_score(y_train, result["train_preds"])
result["Recall_test"] = metrics.recall_score(y_test, result["preds"])
result["Accuracy_train"] = metrics.accuracy_score(y_train, result["train_preds"])
result["Accuracy_test"] = metrics.accuracy_score(y_test, result["preds"])
result["ROC_AUC_test"] = metrics.roc_auc_score(y_test, result["probs"])
result["F1_train"] = metrics.f1_score(y_train, result["train_preds"])
result["F1_test"] = metrics.f1_score(y_test, result["preds"])
result["MCC_test"] = metrics.matthews_corrcoef(y_test, result["preds"])
result["Cohen_kappa_test"] = metrics.cohen_kappa_score(y_test, result["preds"])
result["Confusion_matrix"] = metrics.confusion_matrix(y_test, result["preds"])

Формирование данных для оценки старой и новой версии модели

In [ ]:
optimized_metrics = pd.DataFrame(columns=list(result.keys()))
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=class_models[optimized_model_type]
)
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=result
)
optimized_metrics.insert(loc=0, column="Name", value=["Old", "New"])
optimized_metrics = optimized_metrics.set_index("Name")

Оценка параметров старой и новой модели

In [ ]:
optimized_metrics[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)
In [ ]:
optimized_metrics[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)
In [ ]:
_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False
)

for index in range(0, len(optimized_metrics)):
    c_matrix = optimized_metrics.iloc[index]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Died", "Sirvived"]
    ).plot(ax=ax.flat[index])

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)
plt.show()