MII_Salin_Oleg_PIbd-33/lec4_reg.ipynb
gg12 darfren f49b209552 done
2024-11-20 19:19:56 +04:00

51 KiB
Raw Blame History

Загрузка данных

In [2]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
import numpy as np # type: ignore

from sklearn import set_config

set_config(transform_output="pandas")

random_state = 9

df = pd.read_csv("data/Medical_insurance.csv", index_col=False)

encoder = OneHotEncoder(sparse_output=False, drop="first")

encoded_values = encoder.fit_transform(df[["sex", "region", "smoker"]])

encoded_columns = encoder.get_feature_names_out(["sex", "region", "smoker"])

encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)

df = pd.concat([df, encoded_values_df], axis=1)

df = df.drop(["sex", "smoker", "region"], axis=1)

df
Out[2]:
age bmi children charges sex_male region_northwest region_southeast region_southwest smoker_yes
0 19 27.900 0 16884.92400 0.0 0.0 0.0 1.0 1.0
1 18 33.770 1 1725.55230 1.0 0.0 1.0 0.0 0.0
2 28 33.000 3 4449.46200 1.0 0.0 1.0 0.0 0.0
3 33 22.705 0 21984.47061 1.0 1.0 0.0 0.0 0.0
4 32 28.880 0 3866.85520 1.0 1.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ...
2767 47 45.320 1 8569.86180 0.0 0.0 1.0 0.0 0.0
2768 21 34.600 0 2020.17700 0.0 0.0 0.0 1.0 0.0
2769 19 26.030 1 16450.89470 1.0 1.0 0.0 0.0 1.0
2770 23 18.715 0 21595.38229 1.0 1.0 0.0 0.0 0.0
2771 54 31.600 0 9850.43200 1.0 0.0 0.0 1.0 0.0

2772 rows × 9 columns

Формирование выборок

In [3]:
from utils import split_stratified_into_train_val_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df,
    stratify_colname="age",
    target_colname="charges",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=random_state,
)

X_train = X_train.drop(["charges"], axis=1)
X_test = X_test.drop(["charges"], axis=1)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
age bmi children sex_male region_northwest region_southeast region_southwest smoker_yes
2146 22 34.580 2 0.0 0.0 0.0 0.0 0.0
472 19 29.800 0 0.0 0.0 0.0 1.0 0.0
801 64 35.970 0 0.0 0.0 1.0 0.0 0.0
84 37 34.800 2 0.0 0.0 0.0 1.0 1.0
2028 61 33.915 0 1.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ...
979 36 29.920 0 0.0 0.0 1.0 0.0 0.0
1475 55 26.980 0 0.0 1.0 0.0 0.0 0.0
2547 34 42.130 2 1.0 0.0 1.0 0.0 0.0
2553 29 24.600 2 0.0 0.0 0.0 1.0 0.0
1974 61 35.910 0 0.0 0.0 0.0 0.0 0.0

2217 rows × 8 columns

'y_train'
charges
2146 3925.75820
472 1744.46500
801 14313.84630
84 39836.51900
2028 13143.86485
... ...
979 4889.03680
1475 11082.57720
2547 5124.18870
2553 4529.47700
1974 13635.63790

2217 rows × 1 columns

'X_test'
age bmi children sex_male region_northwest region_southeast region_southwest smoker_yes
1101 53 28.600 3 1.0 0.0 0.0 1.0 0.0
2025 56 33.660 4 1.0 0.0 1.0 0.0 0.0
307 30 33.330 1 0.0 0.0 1.0 0.0 0.0
840 21 31.100 0 1.0 0.0 0.0 1.0 0.0
2090 47 29.545 1 0.0 1.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ...
1587 48 32.230 1 0.0 0.0 1.0 0.0 0.0
1177 40 27.400 1 0.0 0.0 0.0 1.0 0.0
1259 52 23.180 0 0.0 0.0 0.0 0.0 0.0
1291 19 34.900 0 1.0 0.0 0.0 1.0 1.0
2040 59 35.200 0 0.0 0.0 1.0 0.0 0.0

555 rows × 8 columns

'y_test'
charges
1101 11253.42100
2025 12949.15540
307 4151.02870
840 1526.31200
2090 8930.93455
... ...
1587 8871.15170
1177 6496.88600
1259 10197.77220
1291 34828.65400
2040 12244.53100

555 rows × 1 columns

Определение перечня алгоритмов решения задачи аппроксимации (регрессии)

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

random_state = 9

models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
    "mlp": {
        "model": neural_network.MLPRegressor(
            activation="tanh",
            hidden_layer_sizes=(3,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        )
    },
}

Определение функции для стандартизации числовых значений для MLP

In [5]:
from pandas import DataFrame
from sklearn import preprocessing


stndart_scaler = preprocessing.StandardScaler()


def std_q(df: DataFrame) -> DataFrame:
    df["age"] = np.array(stndart_scaler.fit_transform(df["age"].to_numpy().reshape(-1, 1))).reshape(
        df["age"].shape
    )
    df["bmi"] = np.array(
        stndart_scaler.fit_transform(df["bmi"].to_numpy().reshape(-1, 1))
    ).reshape(df["bmi"].shape)
    df["children"] = np.array(
        stndart_scaler.fit_transform(df["children"].to_numpy().reshape(-1, 1))
    ).reshape(df["children"].shape)
    return df

Обучение и оценка моделей с помощью различных алгоритмов

In [6]:
import math
from pandas import DataFrame
from sklearn import metrics

for model_name in models.keys():
    print(f"Model: {model_name}")

    x_train: DataFrame = X_train.copy()
    x_test: DataFrame = X_test.copy()

    if model_name == "mlp":
        x_train = std_q(x_train)
        x_test = std_q(x_test)
        
    fitted_model = models[model_name]["model"].fit(
        x_train.values, y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(x_train.values)
    y_test_pred = fitted_model.predict(x_test.values)
    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(y_test, y_test_pred)
Model: linear
Model: linear_poly
Model: linear_interact
Model: ridge
Model: decision_tree
Model: knn
Model: random_forest
Model: mlp

Вывод результатов оценки

In [ ]:
reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient( # type: ignore
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[ ]:
  RMSE_train RMSE_test RMAE_test R2_test
random_forest 3221.469707 3953.661053 45.741609 0.901103
decision_tree 3643.279193 4288.040726 47.359073 0.883668
linear_poly 4731.024654 4868.817371 54.257745 0.850021
linear_interact 4776.393716 4938.699556 54.641209 0.845685
ridge 6028.427617 6216.544081 65.584948 0.755499
linear 6028.426993 6216.588829 65.580879 0.755496
knn 8230.959070 9715.102581 81.129201 0.402859
mlp 17848.198895 18518.275054 116.605174 -1.169619

Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок

Получение лучшей модели

In [ ]:
best_model = str(reg_metrics.sort_values(by="RMSE_test").iloc[0].name) # type: ignore

display(best_model)
'random_forest'

Вывод для обучающей выборки

In [12]:
pd.concat(
    [
        X_train,
        y_train,
        pd.Series(
            models[best_model]["train_preds"],
            index=y_train.index,
            name="ChargesPred",
        ),
    ],
    axis=1,
).head(5)
Out[12]:
age bmi children sex_male region_northwest region_southeast region_southwest smoker_yes charges ChargesPred
2146 22 34.580 2 0.0 0.0 0.0 0.0 0.0 3925.75820 4868.448184
472 19 29.800 0 0.0 0.0 0.0 1.0 0.0 1744.46500 3011.805771
801 64 35.970 0 0.0 0.0 1.0 0.0 0.0 14313.84630 13841.766282
84 37 34.800 2 0.0 0.0 0.0 1.0 1.0 39836.51900 39427.673528
2028 61 33.915 0 1.0 0.0 0.0 0.0 0.0 13143.86485 13575.291528

Вывод для тестовой выборки

In [13]:
pd.concat(
    [
        X_test,
        y_test,
        pd.Series(
            models[best_model]["preds"],
            index=y_test.index,
            name="ChargesPred",
        ),
    ],
    axis=1,
).head(5)
Out[13]:
age bmi children sex_male region_northwest region_southeast region_southwest smoker_yes charges ChargesPred
1101 53 28.600 3 1.0 0.0 0.0 1.0 0.0 11253.42100 12139.772544
2025 56 33.660 4 1.0 0.0 1.0 0.0 0.0 12949.15540 14977.306757
307 30 33.330 1 0.0 0.0 1.0 0.0 0.0 4151.02870 5778.492115
840 21 31.100 0 1.0 0.0 0.0 1.0 0.0 1526.31200 3324.843009
2090 47 29.545 1 0.0 1.0 0.0 0.0 0.0 8930.93455 11318.629065