MII_Salin_Oleg_PIbd-33/lec4_reg.ipynb at master

gg12 darfren f49b209552 done

2024-11-20 19:19:56 +04:00

51 KiB

Raw Permalink Blame History

Загрузка данных¶

In [2]:

import pandas as pd

from sklearn.preprocessing import OneHotEncoder
import numpy as np # type: ignore

from sklearn import set_config

set_config(transform_output="pandas")

random_state = 9

df = pd.read_csv("data/Medical_insurance.csv", index_col=False)

encoder = OneHotEncoder(sparse_output=False, drop="first")

encoded_values = encoder.fit_transform(df[["sex", "region", "smoker"]])

encoded_columns = encoder.get_feature_names_out(["sex", "region", "smoker"])

encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)

df = pd.concat([df, encoded_values_df], axis=1)

df = df.drop(["sex", "smoker", "region"], axis=1)

df

Out[2]:

	age	bmi	children	charges	sex_male	region_northwest	region_southeast	region_southwest	smoker_yes
0	19	27.900	0	16884.92400	0.0	0.0	0.0	1.0	1.0
1	18	33.770	1	1725.55230	1.0	0.0	1.0	0.0	0.0
2	28	33.000	3	4449.46200	1.0	0.0	1.0	0.0	0.0
3	33	22.705	0	21984.47061	1.0	1.0	0.0	0.0	0.0
4	32	28.880	0	3866.85520	1.0	1.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...
2767	47	45.320	1	8569.86180	0.0	0.0	1.0	0.0	0.0
2768	21	34.600	0	2020.17700	0.0	0.0	0.0	1.0	0.0
2769	19	26.030	1	16450.89470	1.0	1.0	0.0	0.0	1.0
2770	23	18.715	0	21595.38229	1.0	1.0	0.0	0.0	0.0
2771	54	31.600	0	9850.43200	1.0	0.0	0.0	1.0	0.0

2772 rows × 9 columns

Формирование выборок¶

In [3]:

from utils import split_stratified_into_train_val_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df,
    stratify_colname="age",
    target_colname="charges",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=random_state,
)

X_train = X_train.drop(["charges"], axis=1)
X_test = X_test.drop(["charges"], axis=1)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)

'X_train'

	age	bmi	children	sex_male	region_northwest	region_southeast	region_southwest	smoker_yes
2146	22	34.580	2	0.0	0.0	0.0	0.0	0.0
472	19	29.800	0	0.0	0.0	0.0	1.0	0.0
801	64	35.970	0	0.0	0.0	1.0	0.0	0.0
84	37	34.800	2	0.0	0.0	0.0	1.0	1.0
2028	61	33.915	0	1.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...
979	36	29.920	0	0.0	0.0	1.0	0.0	0.0
1475	55	26.980	0	0.0	1.0	0.0	0.0	0.0
2547	34	42.130	2	1.0	0.0	1.0	0.0	0.0
2553	29	24.600	2	0.0	0.0	0.0	1.0	0.0
1974	61	35.910	0	0.0	0.0	0.0	0.0	0.0

2217 rows × 8 columns

'y_train'

	charges
2146	3925.75820
472	1744.46500
801	14313.84630
84	39836.51900
2028	13143.86485
...	...
979	4889.03680
1475	11082.57720
2547	5124.18870
2553	4529.47700
1974	13635.63790

2217 rows × 1 columns

'X_test'

	age	bmi	children	sex_male	region_northwest	region_southeast	region_southwest	smoker_yes
1101	53	28.600	3	1.0	0.0	0.0	1.0	0.0
2025	56	33.660	4	1.0	0.0	1.0	0.0	0.0
307	30	33.330	1	0.0	0.0	1.0	0.0	0.0
840	21	31.100	0	1.0	0.0	0.0	1.0	0.0
2090	47	29.545	1	0.0	1.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...
1587	48	32.230	1	0.0	0.0	1.0	0.0	0.0
1177	40	27.400	1	0.0	0.0	0.0	1.0	0.0
1259	52	23.180	0	0.0	0.0	0.0	0.0	0.0
1291	19	34.900	0	1.0	0.0	0.0	1.0	1.0
2040	59	35.200	0	0.0	0.0	1.0	0.0	0.0

555 rows × 8 columns

'y_test'

	charges
1101	11253.42100
2025	12949.15540
307	4151.02870
840	1526.31200
2090	8930.93455
...	...
1587	8871.15170
1177	6496.88600
1259	10197.77220
1291	34828.65400
2040	12244.53100

555 rows × 1 columns

Определение перечня алгоритмов решения задачи аппроксимации (регрессии)¶

In [4]:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

random_state = 9

models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
    "mlp": {
        "model": neural_network.MLPRegressor(
            activation="tanh",
            hidden_layer_sizes=(3,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        )
    },
}

Определение функции для стандартизации числовых значений для MLP¶

In [5]:

from pandas import DataFrame
from sklearn import preprocessing


stndart_scaler = preprocessing.StandardScaler()


def std_q(df: DataFrame) -> DataFrame:
    df["age"] = np.array(stndart_scaler.fit_transform(df["age"].to_numpy().reshape(-1, 1))).reshape(
        df["age"].shape
    )
    df["bmi"] = np.array(
        stndart_scaler.fit_transform(df["bmi"].to_numpy().reshape(-1, 1))
    ).reshape(df["bmi"].shape)
    df["children"] = np.array(
        stndart_scaler.fit_transform(df["children"].to_numpy().reshape(-1, 1))
    ).reshape(df["children"].shape)
    return df

Обучение и оценка моделей с помощью различных алгоритмов¶

In [6]:

import math
from pandas import DataFrame
from sklearn import metrics

for model_name in models.keys():
    print(f"Model: {model_name}")

    x_train: DataFrame = X_train.copy()
    x_test: DataFrame = X_test.copy()

    if model_name == "mlp":
        x_train = std_q(x_train)
        x_test = std_q(x_test)
        
    fitted_model = models[model_name]["model"].fit(
        x_train.values, y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(x_train.values)
    y_test_pred = fitted_model.predict(x_test.values)
    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(y_test, y_test_pred)

Model: linear
Model: linear_poly
Model: linear_interact
Model: ridge
Model: decision_tree
Model: knn
Model: random_forest
Model: mlp

Вывод результатов оценки¶

In [ ]:

reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient( # type: ignore
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])

Out[ ]:

	RMSE_train	RMSE_test	RMAE_test	R2_test
random_forest	3221.469707	3953.661053	45.741609	0.901103
decision_tree	3643.279193	4288.040726	47.359073	0.883668
linear_poly	4731.024654	4868.817371	54.257745	0.850021
linear_interact	4776.393716	4938.699556	54.641209	0.845685
ridge	6028.427617	6216.544081	65.584948	0.755499
linear	6028.426993	6216.588829	65.580879	0.755496
knn	8230.959070	9715.102581	81.129201	0.402859
mlp	17848.198895	18518.275054	116.605174	-1.169619

Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок¶

Получение лучшей модели

In [ ]:

best_model = str(reg_metrics.sort_values(by="RMSE_test").iloc[0].name) # type: ignore

display(best_model)

'random_forest'

Вывод для обучающей выборки

In [12]:

pd.concat(
    [
        X_train,
        y_train,
        pd.Series(
            models[best_model]["train_preds"],
            index=y_train.index,
            name="ChargesPred",
        ),
    ],
    axis=1,
).head(5)

Out[12]:

	age	bmi	children	sex_male	region_southeast	region_southwest	smoker_yes	charges	ChargesPred
2146	22	34.580	2	0.0	0.0	0.0	0.0	3925.75820	4868.448184
472	19	29.800	0	0.0	0.0	1.0	0.0	1744.46500	3011.805771
801	64	35.970	0	0.0	1.0	0.0	0.0	14313.84630	13841.766282
84	37	34.800	2	0.0	0.0	1.0	1.0	39836.51900	39427.673528
2028	61	33.915	0	1.0	0.0	0.0	0.0	13143.86485	13575.291528

Вывод для тестовой выборки

In [13]:

pd.concat(
    [
        X_test,
        y_test,
        pd.Series(
            models[best_model]["preds"],
            index=y_test.index,
            name="ChargesPred",
        ),
    ],
    axis=1,
).head(5)

Out[13]:

	age	bmi	children	sex_male	region_northwest	region_southeast	region_southwest	charges	ChargesPred
1101	53	28.600	3	1.0	0.0	0.0	1.0	11253.42100	12139.772544
2025	56	33.660	4	1.0	0.0	1.0	0.0	12949.15540	14977.306757
307	30	33.330	1	0.0	0.0	1.0	0.0	4151.02870	5778.492115
840	21	31.100	0	1.0	0.0	0.0	1.0	1526.31200	3324.843009
2090	47	29.545	1	0.0	1.0	0.0	0.0	8930.93455	11318.629065

51 KiB Raw Permalink Blame History Unescape Escape