51 KiB
51 KiB
Загрузка данных¶
In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np # type: ignore
from sklearn import set_config
# Ask scikit-learn transformers to return pandas objects instead of arrays.
set_config(transform_output="pandas")

# Seed reused by the split and by the stochastic estimators.
random_state = 9

# Categorical features that are replaced by one-hot indicator columns.
categorical_columns = ["sex", "region", "smoker"]

df = pd.read_csv("data/Medical_insurance.csv", index_col=False)

# drop="first" removes one indicator per feature so the remaining
# indicators are not perfectly collinear.
encoder = OneHotEncoder(drop="first", sparse_output=False)
encoded_values = encoder.fit_transform(df[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)

# Replace the raw categorical columns with their encoded counterparts
# (encoded columns are appended on the right).
df = pd.concat(
    [df.drop(columns=categorical_columns), encoded_values_df],
    axis=1,
)
df
Out[2]:
Формирование выборок¶
In [3]:
from utils import split_stratified_into_train_val_test
# Request an 80/20 train/test split with an empty validation share,
# stratified on "age" and with "charges" as the target column.
(
    X_train,
    X_val,
    X_test,
    y_train,
    y_val,
    y_test,
) = split_stratified_into_train_val_test(
    df,
    stratify_colname="age",
    target_colname="charges",
    frac_train=0.80,
    frac_val=0,
    frac_test=0.20,
    random_state=random_state,
)

# The returned feature frames still carry the target column, so remove it
# before the frames are used as model inputs.
X_train = X_train.drop(columns=["charges"])
X_test = X_test.drop(columns=["charges"])

# Show each part of the split under its own label.
for split_label, split_part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    display(split_label, split_part)
Определение перечня алгоритмов решения задачи аппроксимации (регрессии)¶
In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network
# Seed passed to every estimator below that accepts `random_state`.
random_state = 9


def _linear_on_polynomial(**poly_options):
    """Build a PolynomialFeatures -> LinearRegression pipeline.

    `poly_options` are forwarded to `PolynomialFeatures`. The regression is
    created with `fit_intercept=False` and `n_jobs=-1`.
    """
    return make_pipeline(
        PolynomialFeatures(**poly_options),
        linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
    )


# (name, estimator) pairs; the order here is the iteration order of `models`.
_candidate_estimators = [
    ("linear", linear_model.LinearRegression(n_jobs=-1)),
    ("linear_poly", _linear_on_polynomial(degree=2)),
    ("linear_interact", _linear_on_polynomial(interaction_only=True)),
    ("ridge", linear_model.RidgeCV()),
    (
        "decision_tree",
        tree.DecisionTreeRegressor(max_depth=7, random_state=random_state),
    ),
    ("knn", neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)),
    (
        "random_forest",
        ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        ),
    ),
    (
        "mlp",
        neural_network.MLPRegressor(
            activation="tanh",
            hidden_layer_sizes=(3,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        ),
    ),
]

# Registry keyed by model name; each entry starts with only the unfitted
# estimator under "model" and is extended with results by later cells.
models = {name: {"model": estimator} for name, estimator in _candidate_estimators}
Определение функции для стандартизации числовых значений для MLP¶
In [5]:
from pandas import DataFrame
from sklearn import preprocessing
# Single scaler instance shared by all calls; it is refit for every column.
stndart_scaler = preprocessing.StandardScaler()


def std_q(df: DataFrame) -> DataFrame:
    """Standardize the "age", "bmi" and "children" columns of `df`.

    Each column is replaced, in the frame that was passed in, by the result
    of `stndart_scaler.fit_transform` on that column alone. The scaler is
    refit on every call, so the statistics always come from the frame given
    here. Calling this separately on two frames scales them with different
    statistics.

    Args:
        df: Frame containing the three numeric columns; it is modified.

    Returns:
        The same `df` object, with the three columns standardized.
    """
    for column in ("age", "bmi", "children"):
        # StandardScaler expects a 2-D input, hence the (n, 1) reshape.
        as_column_vector = df[column].to_numpy().reshape(-1, 1)
        standardized = np.array(stndart_scaler.fit_transform(as_column_vector))
        # Flatten back to the original 1-D shape before writing the column.
        df[column] = standardized.reshape(df[column].shape)
    return df
Обучение и оценка моделей с помощью различных алгоритмов¶
In [6]:
import math
from pandas import DataFrame
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Numeric features that are standardized before being fed to the MLP.
mlp_numeric_columns = ["age", "bmi", "children"]

# Fit every registered estimator, then store its predictions and metrics
# back into its own entry of `models`.
for model_name, model_entry in models.items():
    print(f"Model: {model_name}")

    # Work on copies so the shared X_train / X_test stay unscaled for the
    # models that do not need standardization.
    x_train: DataFrame = X_train.copy()
    x_test: DataFrame = X_test.copy()

    if model_name == "mlp":
        # Learn mean/std on the training split ONLY and reuse them for the
        # test split. Refitting the scaler on the test data would scale the
        # two splits differently and leak test statistics into evaluation.
        mlp_scaler = StandardScaler()
        train_scaled = np.asarray(
            mlp_scaler.fit_transform(x_train[mlp_numeric_columns])
        )
        test_scaled = np.asarray(mlp_scaler.transform(x_test[mlp_numeric_columns]))
        for position, column in enumerate(mlp_numeric_columns):
            x_train[column] = train_scaled[:, position]
            x_test[column] = test_scaled[:, position]

    # Estimators are trained on plain arrays; the target is flattened to 1-D.
    fitted_model = model_entry["model"].fit(x_train.values, y_train.values.ravel())

    y_train_pred = fitted_model.predict(x_train.values)
    y_test_pred = fitted_model.predict(x_test.values)

    model_entry["fitted"] = fitted_model
    model_entry["train_preds"] = y_train_pred
    model_entry["preds"] = y_test_pred

    model_entry["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(y_train, y_train_pred)
    )
    model_entry["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(y_test, y_test_pred)
    )
    # Note: this is the square root of the mean absolute error, not the MAE.
    model_entry["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(y_test, y_test_pred)
    )
    model_entry["R2_test"] = metrics.r2_score(y_test, y_test_pred)
Вывод результатов оценки¶
In [ ]:
# Metric columns pulled out of the per-model result dictionaries.
metric_columns = ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]

# One row per model name, one column per metric.
reg_metrics = pd.DataFrame.from_dict(models, orient="index")[metric_columns]

# Rank by test RMSE and colour the two metric groups with separate colormaps.
metrics_styler = reg_metrics.sort_values(by="RMSE_test").style  # type: ignore
metrics_styler = metrics_styler.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
)
metrics_styler.background_gradient(
    cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"]
)
Out[ ]:
Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок¶
Получение лучшей модели
In [ ]:
# The best model is the index label of the row with the smallest test RMSE.
ranked_by_test_rmse = reg_metrics.sort_values(by="RMSE_test")
best_model = str(ranked_by_test_rmse.index[0])  # type: ignore
display(best_model)
Вывод для обучающей выборки
In [12]:
# Predictions of the best model on the training split, aligned to the
# target's index so they line up row by row in the concatenation below.
train_predictions = pd.Series(
    models[best_model]["train_preds"],
    index=y_train.index,
    name="ChargesPred",
)

# Features, actual target and predicted target side by side (first 5 rows).
pd.concat([X_train, y_train, train_predictions], axis=1).head(5)
Out[12]:
Вывод для тестовой выборки
In [13]:
# Predictions of the best model on the test split, aligned to the target's
# index so they line up row by row in the concatenation below.
test_predictions = pd.Series(
    models[best_model]["preds"],
    index=y_test.index,
    name="ChargesPred",
)

# Features, actual target and predicted target side by side (first 5 rows).
pd.concat([X_test, y_test, test_predictions], axis=1).head(5)
Out[13]: