{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "#### Загрузка данных" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TAl2O3TiO2Density
0200.00.01.06250
1250.00.01.05979
2350.00.01.05404
\n", "
" ], "text/plain": [ " T Al2O3 TiO2 Density\n", "0 20 0.0 0.0 1.06250\n", "1 25 0.0 0.0 1.05979\n", "2 35 0.0 0.0 1.05404" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TAl2O3TiO2Density
0300.000.01.05696
1550.000.01.04158
2250.050.01.08438
# %%
import pandas as pd

# Both files use the same dialect: ';' separates fields and ',' is the
# decimal mark, so the reader options are declared once and shared.
csv_options = {"sep": ";", "decimal": ","}

density_train = pd.read_csv("data/density/density_train.csv", **csv_options)
density_test = pd.read_csv("data/density/density_test.csv", **csv_options)

# Preview the first rows of each sample.
display(density_train.head(3), density_test.head(3))

# %% [markdown]
# #### Building the feature and target sets
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TAl2O3TiO2
0200.00.0
1250.00.0
2350.00.0
\n", "
" ], "text/plain": [ " T Al2O3 TiO2\n", "0 20 0.0 0.0\n", "1 25 0.0 0.0\n", "2 35 0.0 0.0" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "0 1.06250\n", "1 1.05979\n", "2 1.05404\n", "Name: Density, dtype: float64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TAl2O3TiO2
0300.000.0
1550.000.0
2250.050.0
# %%
# Split each sample into a feature frame and a target series.
# NOTE: this cell rebinds density_train / density_test to the feature-only
# frames, so re-running it requires re-running the loading cell first.
density_y_train = density_train["Density"]
density_train = density_train.drop(columns=["Density"])

display(density_train.head(3))
display(density_y_train.head(3))

density_y_test = density_test["Density"]
density_test = density_test.drop(columns=["Density"])

display(density_test.head(3))
display(density_y_test.head(3))

# %% [markdown]
# #### Candidate algorithms for the approximation (regression) task

# %%
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

# Shared seed for every estimator that accepts one.
random_state = 9

# name -> {"model": unfitted estimator}; the training loop below adds the
# fitted estimator, the predictions and the metrics to each entry.
models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            # PolynomialFeatures already emits the bias column.
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
    "mlp": {
        "model": neural_network.MLPRegressor(
            activation="tanh",
            hidden_layer_sizes=(3,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        )
    },
}

# %% [markdown]
# #### Function that standardizes the "Temperature" column values for the MLP

# %%
from pandas import DataFrame
from sklearn import preprocessing

standard_scaler = preprocessing.StandardScaler()
# The original (misspelled) name is kept as an alias for existing references.
stndart_scaler = standard_scaler


def std_temp(df: DataFrame, fit: bool = True) -> DataFrame:
    """Return a copy of ``df`` with the ``"T"`` column standardized.

    The module-level ``standard_scaler`` holds the scaling statistics.

    Args:
        df: Frame containing a numeric ``"T"`` column. It is not modified.
        fit: When True (default) the scaler is re-fitted on ``df["T"]``
            before transforming. Pass False to reuse the statistics learned
            by an earlier call, e.g. to scale the test sample with the
            training mean and standard deviation.

    Returns:
        A new frame equal to ``df`` except that ``"T"`` holds the
        standardized values.

    Raises:
        sklearn.exceptions.NotFittedError: if ``fit`` is False and the
            scaler has not been fitted yet.
    """
    scaled = df.copy()
    # StandardScaler expects a 2-D array, hence the (n, 1) column selection.
    temperature = scaled[["T"]].to_numpy()
    if fit:
        standard_scaler.fit(temperature)
    scaled["T"] = standard_scaler.transform(temperature).ravel()
    return scaled


# %% [markdown]
# #### Training and evaluating the models with the different algorithms

# %%
import math
from pandas import DataFrame
from sklearn import metrics

for model_name, entry in models.items():
    print(f"Model: {model_name}")
    X_train: DataFrame = density_train.copy()
    X_test: DataFrame = density_test.copy()

    if model_name == "mlp":
        # Learn the scaling statistics on the training sample only and apply
        # the same transformation to the test sample; re-fitting on the test
        # sample would put the two samples on different scales.
        X_train = std_temp(X_train, fit=True)
        X_test = std_temp(X_test, fit=False)

    fitted_model = entry["model"].fit(
        X_train.values, density_y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(X_train.values)
    y_test_pred = fitted_model.predict(X_test.values)
    mae_test = metrics.mean_absolute_error(density_y_test, y_test_pred)

    entry["fitted"] = fitted_model
    entry["train_preds"] = y_train_pred
    entry["preds"] = y_test_pred
    entry["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(density_y_train, y_train_pred)
    )
    entry["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(density_y_test, y_test_pred)
    )
    # "RMAE_test" is the square root of the MAE and is kept under its
    # original key for the results table. sqrt(MAE) is not expressed in the
    # units of the target, so the plain MAE is stored next to it.
    entry["RMAE_test"] = math.sqrt(mae_test)
    entry["R2_test"] = metrics.r2_score(density_y_test, y_test_pred)
    entry["MAE_test"] = mae_test

# %% [markdown]
# #### Evaluation results
 RMSE_trainRMSE_testRMAE_testR2_test
linear_poly0.0003190.0003620.0166430.999965
linear_interact0.0011310.0014910.0331980.999413
linear0.0024640.0032610.0498910.997191
random_forest0.0027160.0055750.0672980.991788
decision_tree0.0003460.0064330.0761380.989067
ridge0.0139890.0153560.1163800.937703
knn0.0531080.0567760.2176110.148414
mlp0.0794780.0679100.247692-0.218339
# %%
# One row per model, restricted to the scalar quality metrics.
metric_columns = ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
reg_metrics = pd.DataFrame.from_dict(models, orient="index")[metric_columns]

# Rank by test RMSE and colour the error columns and the remaining columns
# with two separate gradients.
ranked_styler = reg_metrics.sort_values(by="RMSE_test").style
ranked_styler = ranked_styler.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
)
ranked_styler = ranked_styler.background_gradient(
    cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"]
)
ranked_styler

# %% [markdown]
# #### Actual and "predicted" values for the training and test samples

# %% [markdown]
# Selecting the best model

# %%
# The best model is the first index label after sorting by test RMSE.
best_model = str(reg_metrics.sort_values(by="RMSE_test").index[0])

display(best_model)

# %% [markdown]
# Output for the training sample
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TAl2O3TiO2DensityDensityPred
0200.00.01.062501.063174
1250.00.01.059791.060117
2350.00.01.054041.053941
3400.00.01.051031.050822
4450.00.01.047941.047683
# %%
# Put the best model's training predictions next to the features and the
# actual target, aligned on the target's index.
train_predictions = pd.Series(
    models[best_model]["train_preds"],
    index=density_y_train.index,
    name="DensityPred",
)
pd.concat([density_train, density_y_train, train_predictions], axis=1).head(5)

# %% [markdown]
# Output for the test sample
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TAl2O3TiO2DensityDensityPred
0300.000.01.056961.057040
1550.000.01.041581.041341
2250.050.01.084381.084063
3300.050.01.081121.080764
4350.050.01.077811.077444
# %%
# Put the best model's test predictions next to the features and the actual
# target, aligned on the target's index.
test_predictions = pd.Series(
    models[best_model]["preds"],
    index=density_y_test.index,
    name="DensityPred",
)
pd.concat([density_test, density_y_test, test_predictions], axis=1).head(5)