PIbd-33_Ivanova_S.V._MAI/lec4_reg.ipynb

28 KiB
Raw Blame History

Загрузка данных

In [146]:
import pandas as pd

# Both CSVs use ';' as the field separator and ',' as the decimal mark.
_read_opts = {"sep": ";", "decimal": ","}

density_train = pd.read_csv("data/density/density_train.csv", **_read_opts)
density_test = pd.read_csv("data/density/density_test.csv", **_read_opts)

# Quick sanity peek at both splits.
display(density_train.head(3))
display(density_test.head(3))
T Al2O3 TiO2 Density
0 20 0.0 0.0 1.06250
1 25 0.0 0.0 1.05979
2 35 0.0 0.0 1.05404
T Al2O3 TiO2 Density
0 30 0.00 0.0 1.05696
1 55 0.00 0.0 1.04158
2 25 0.05 0.0 1.08438

Формирование выборок

In [147]:
# Separate the target column ("Density") from the features — training split.
density_y_train = density_train["Density"]
density_train = density_train.drop(columns=["Density"])

display(density_train.head(3))
display(density_y_train.head(3))

# Same target/feature separation for the hold-out split.
density_y_test = density_test["Density"]
density_test = density_test.drop(columns=["Density"])

display(density_test.head(3))
display(density_y_test.head(3))
T Al2O3 TiO2
0 20 0.0 0.0
1 25 0.0 0.0
2 35 0.0 0.0
0    1.06250
1    1.05979
2    1.05404
Name: Density, dtype: float64
T Al2O3 TiO2
0 30 0.00 0.0
1 55 0.00 0.0
2 25 0.05 0.0
0    1.05696
1    1.04158
2    1.08438
Name: Density, dtype: float64

Определение перечня алгоритмов решения задачи аппроксимации (регрессии)

In [148]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

random_state = 9

# Candidate regressors, keyed by a short name.  Each entry holds the (unfitted)
# estimator under "model"; fit results and metrics are attached to the same
# dict later on.  Insertion order defines evaluation order.
models = {}

models["linear"] = {"model": linear_model.LinearRegression(n_jobs=-1)}

# Full degree-2 polynomial expansion; the bias column produced by
# PolynomialFeatures replaces the regression intercept.
models["linear_poly"] = {
    "model": make_pipeline(
        PolynomialFeatures(degree=2),
        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
    )
}

# Interaction terms only (no pure squares).
models["linear_interact"] = {
    "model": make_pipeline(
        PolynomialFeatures(interaction_only=True),
        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
    )
}

models["ridge"] = {"model": linear_model.RidgeCV()}

models["decision_tree"] = {
    "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
}

models["knn"] = {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)}

models["random_forest"] = {
    "model": ensemble.RandomForestRegressor(
        max_depth=7, random_state=random_state, n_jobs=-1
    )
}

# Small MLP; early stopping guards against overfitting on this tiny dataset.
models["mlp"] = {
    "model": neural_network.MLPRegressor(
        activation="tanh",
        hidden_layer_sizes=(3,),
        max_iter=500,
        early_stopping=True,
        random_state=random_state,
    )
}

Определение функции для стандартизации значений в столбце "Температура" для MLP

In [149]:
from pandas import DataFrame
from sklearn import preprocessing

# Shared scaler instance (name spelling kept for compatibility with any
# other cells that may reference it).
stndart_scaler = preprocessing.StandardScaler()

def std_temp(df: DataFrame, fit: bool = True) -> DataFrame:
    """Standardize the temperature column "T" in place and return `df`.

    Parameters
    ----------
    df : DataFrame
        Frame containing a "T" column; it is MUTATED (the column is
        overwritten with standardized values).
    fit : bool, default True
        True refits `stndart_scaler` on `df` before transforming — this is
        the original behavior.  NOTE: calling this with `fit=True` on the
        test set (as the training loop below does) refits the scaler on
        test statistics, which is train/test leakage.  The correct usage is
        `std_temp(X_train)` followed by `std_temp(X_test, fit=False)`,
        which reuses the train-set mean/std.

    Returns
    -------
    DataFrame
        The same `df` object, for call chaining.
    """
    column = df["T"].to_numpy().reshape(-1, 1)
    scaled = stndart_scaler.fit_transform(column) if fit else stndart_scaler.transform(column)
    df["T"] = scaled.reshape(df["T"].shape)
    return df

Обучение и оценка моделей с помощью различных алгоритмов

In [150]:
import math
from pandas import DataFrame
from sklearn import metrics

# Fit every candidate on the training split and score it on both splits.
# All results are stashed back into the per-model dicts.
for model_name, model_entry in models.items():
    print(f"Model: {model_name}")

    # Work on copies so the shared feature frames stay untouched.
    X_train: DataFrame = density_train.copy()
    X_test: DataFrame = density_test.copy()

    # Only the MLP is scale-sensitive, so temperature is standardized for it.
    if model_name == "mlp":
        X_train = std_temp(X_train)
        X_test = std_temp(X_test)

    fitted = model_entry["model"].fit(
        X_train.values, density_y_train.values.ravel()
    )
    train_preds = fitted.predict(X_train.values)
    test_preds = fitted.predict(X_test.values)

    model_entry.update(
        fitted=fitted,
        train_preds=train_preds,
        preds=test_preds,
        # Root-mean-squared error on both splits (train vs. test gap shows overfit).
        RMSE_train=math.sqrt(metrics.mean_squared_error(density_y_train, train_preds)),
        RMSE_test=math.sqrt(metrics.mean_squared_error(density_y_test, test_preds)),
        # Square root of MAE ("RMAE") — kept as defined by the original analysis.
        RMAE_test=math.sqrt(metrics.mean_absolute_error(density_y_test, test_preds)),
        R2_test=metrics.r2_score(density_y_test, test_preds),
    )
Model: linear
Model: linear_poly
Model: linear_interact
Model: ridge
Model: decision_tree
Model: knn
Model: random_forest
Model: mlp

Вывод результатов оценки

In [151]:
# Collect the per-model metrics into a frame (one row per model) and render
# it sorted by test RMSE with colour gradients for quick visual comparison.
metric_columns = ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
reg_metrics = pd.DataFrame.from_dict(models, orient="index")[metric_columns]
(
    reg_metrics.sort_values(by="RMSE_test")
    .style.background_gradient(
        cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
    )
    .background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
)
Out[151]:
  RMSE_train RMSE_test RMAE_test R2_test
linear_poly 0.000319 0.000362 0.016643 0.999965
linear_interact 0.001131 0.001491 0.033198 0.999413
linear 0.002464 0.003261 0.049891 0.997191
random_forest 0.002716 0.005575 0.067298 0.991788
decision_tree 0.000346 0.006433 0.076138 0.989067
ridge 0.013989 0.015356 0.116380 0.937703
knn 0.053108 0.056776 0.217611 0.148414
mlp 0.079478 0.067910 0.247692 -0.218339

Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок

Получение лучшей модели

In [152]:
# The best model is the first row after sorting by test RMSE (lowest error).
best_model = str(reg_metrics.sort_values(by="RMSE_test").index[0])

display(best_model)
'linear_poly'

Вывод для обучающей выборки

In [153]:
# Side-by-side view of features, actual density and the best model's
# predictions on the training split.
train_predictions = pd.Series(
    models[best_model]["train_preds"],
    index=density_y_train.index,
    name="DensityPred",
)
pd.concat([density_train, density_y_train, train_predictions], axis=1).head(5)
Out[153]:
T Al2O3 TiO2 Density DensityPred
0 20 0.0 0.0 1.06250 1.063174
1 25 0.0 0.0 1.05979 1.060117
2 35 0.0 0.0 1.05404 1.053941
3 40 0.0 0.0 1.05103 1.050822
4 45 0.0 0.0 1.04794 1.047683

Вывод для тестовой выборки

In [154]:
# Same side-by-side view for the hold-out split.
test_predictions = pd.Series(
    models[best_model]["preds"],
    index=density_y_test.index,
    name="DensityPred",
)
pd.concat([density_test, density_y_test, test_predictions], axis=1).head(5)
Out[154]:
T Al2O3 TiO2 Density DensityPred
0 30 0.00 0.0 1.05696 1.057040
1 55 0.00 0.0 1.04158 1.041341
2 25 0.05 0.0 1.08438 1.084063
3 30 0.05 0.0 1.08112 1.080764
4 35 0.05 0.0 1.07781 1.077444