28 KiB
28 KiB
Загрузка данных¶
In [146]:
import pandas as pd
# Both CSV files use ';' as the field separator and ',' as the decimal mark.
density_train = pd.read_csv(
    "data/density/density_train.csv",
    decimal=",",
    sep=";",
)
density_test = pd.read_csv(
    "data/density/density_test.csv",
    decimal=",",
    sep=";",
)

# Preview the first three rows of each split (train first, then test).
display(density_train.head(3), density_test.head(3))
Формирование выборок¶
In [147]:
# Separate the "Density" target from the features of each split.
# After this cell `density_train` / `density_test` hold features only.
# NOTE(review): re-running this cell without reloading the data raises a
# KeyError, because "Density" has already been dropped from both frames.
density_y_train = density_train["Density"]
density_train = density_train.drop(columns=["Density"])

density_y_test = density_test["Density"]
density_test = density_test.drop(columns=["Density"])

# Preview features and target of the training split, then of the test split.
display(
    density_train.head(3),
    density_y_train.head(3),
    density_test.head(3),
    density_y_test.head(3),
)
Определение перечня алгоритмов решения задачи аппроксимации (регрессии)¶
In [148]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network
# Single seed shared by every estimator below that accepts `random_state`.
random_state = 9

# Registry of candidate regressors: model name -> {"model": unfitted estimator}.
# Later cells add the fitted estimator, predictions and metrics to each entry.
models = {
    name: {"model": estimator}
    for name, estimator in {
        # Plain ordinary least squares.
        "linear": linear_model.LinearRegression(n_jobs=-1),
        # OLS on degree-2 polynomial features.
        # fit_intercept=False: presumably the bias column produced by
        # PolynomialFeatures plays the role of the intercept — confirm.
        "linear_poly": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(n_jobs=-1, fit_intercept=False),
        ),
        # OLS on pairwise interaction terms only (no pure powers).
        "linear_interact": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(n_jobs=-1, fit_intercept=False),
        ),
        "ridge": linear_model.RidgeCV(),
        "decision_tree": tree.DecisionTreeRegressor(
            random_state=random_state,
            max_depth=7,
        ),
        "knn": neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=7),
        "random_forest": ensemble.RandomForestRegressor(
            n_jobs=-1,
            random_state=random_state,
            max_depth=7,
        ),
        # Small perceptron: one hidden layer with three tanh units.
        "mlp": neural_network.MLPRegressor(
            hidden_layer_sizes=(3,),
            activation="tanh",
            early_stopping=True,
            max_iter=500,
            random_state=random_state,
        ),
    }.items()
}
Определение функции для стандартизации значений в столбце "Температура" для MLP¶
In [149]:
from pandas import DataFrame
from sklearn import preprocessing
# Module-level scaler shared by all calls to `std_temp`, so that statistics
# learned from one frame can be re-applied to another.
stndart_scaler = preprocessing.StandardScaler()


def std_temp(df: DataFrame, fit: bool = True) -> DataFrame:
    """Standardise the "T" column of ``df`` with the shared ``stndart_scaler``.

    The column is overwritten in place on ``df``; the same frame is returned
    for convenience, so pass a copy if the original values must be kept.

    Args:
        df: Frame containing a numeric "T" column.
        fit: When True (default, the original behaviour) the scaler is
            re-fitted on ``df["T"]`` before transforming. Pass False to reuse
            the statistics from a previous fit — e.g. to scale a test set
            with the parameters learned from the training set. The scaler
            must already have been fitted in that case.

    Returns:
        ``df`` with its "T" column replaced by the standardised values.
    """
    # StandardScaler works on 2-D input, hence the (n_samples, 1) column.
    temperature = df[["T"]].to_numpy()
    if fit:
        scaled = stndart_scaler.fit_transform(temperature)
    else:
        scaled = stndart_scaler.transform(temperature)
    # Flatten back to 1-D so the result can be assigned as a single column.
    df["T"] = scaled.ravel()
    return df
Обучение и оценка моделей с помощью различных алгоритмов¶
In [150]:
import math
from pandas import DataFrame
from sklearn import metrics
# Fit every registered model, then store its predictions and quality metrics
# back into its `models` entry for the reporting cells that follow.
for model_name in models.keys():
    print(f"Model: {model_name}")

    # Work on copies so the shared feature frames are never modified.
    X_train: DataFrame = density_train.copy()
    X_test: DataFrame = density_test.copy()

    if model_name == "mlp":
        # Only the MLP gets a standardised "T" column. The scaler is fitted on
        # the training data (std_temp re-fits by default) and the SAME fitted
        # parameters are then applied to the test data. Re-fitting on the test
        # set, as was done before, puts train and test on different scales and
        # leaks test statistics into the evaluation.
        X_train = std_temp(X_train)
        X_test["T"] = stndart_scaler.transform(X_test[["T"]].to_numpy()).ravel()

    fitted_model = models[model_name]["model"].fit(
        X_train.values, density_y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(X_train.values)
    y_test_pred = fitted_model.predict(X_test.values)

    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred

    # Root mean squared error on both splits.
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(density_y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(density_y_test, y_test_pred)
    )
    # Square root of the mean absolute error; the key name is kept because
    # the results table selects this column by name.
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(density_y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(density_y_test, y_test_pred)
Вывод результатов оценки¶
In [151]:
# One row per model; keep only the scalar metric columns (the estimator and
# prediction entries of `models` are dropped from the table).
reg_metrics = pd.DataFrame.from_dict(models, orient="index").loc[
    :, ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]

# Best test RMSE first, with one colour map for the RMSE columns and another
# for the remaining two metrics. The Styler is the cell's displayed output.
(
    reg_metrics.sort_values("RMSE_test")
    .style.background_gradient(
        subset=["RMSE_train", "RMSE_test"],
        cmap="viridis",
        low=1,
        high=0.3,
    )
    .background_gradient(
        subset=["RMAE_test", "R2_test"],
        cmap="plasma",
        low=0.3,
        high=1,
    )
)
Out[151]:
Вывод реального и "спрогнозированного" результата для обучающей и тестовой выборок¶
Получение лучшей модели
In [152]:
# The best model is the index label of the first row once the metrics table
# is ordered by ascending test RMSE.
best_model = str(
    reg_metrics.sort_values("RMSE_test").index[0]
)
display(best_model)
Вывод для обучающей выборки
In [153]:
# Predictions of the best model on the training split, indexed like the
# target so the columns line up row by row.
train_pred_series = pd.Series(
    data=models[best_model]["train_preds"],
    name="DensityPred",
    index=density_y_train.index,
)

# Features | actual density | predicted density, first five rows.
pd.concat(
    (density_train, density_y_train, train_pred_series),
    axis="columns",
).head(5)
Out[153]:
Вывод для тестовой выборки
In [154]:
# Predictions of the best model on the test split, indexed like the target
# so the columns line up row by row.
test_pred_series = pd.Series(
    data=models[best_model]["preds"],
    name="DensityPred",
    index=density_y_test.index,
)

# Features | actual density | predicted density, first five rows.
pd.concat(
    (density_test, density_y_test, test_pred_series),
    axis="columns",
).head(5)
Out[154]: