781 lines
214 KiB
Plaintext
Raw Normal View History

2024-11-15 21:25:43 +04:00
{
"cells": [
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 1,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from scipy import stats\n",
"from sklearn import linear_model, tree, neighbors, ensemble, neural_network\n",
"from sklearn.base import clone\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"df = pd.read_csv(\"..//static//csv//balanced_neo.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **1-я бизнес-цель (регрессия)**: \n",
"\n",
"Предсказание скорости космического объекта для принятия решения о том, насколько опасным он может быть и стоит ли вести за ним наблюдения"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Целевой признак: скорость космического объекта relative_velocity\n",
"\n",
"Вход: минимальный диаметр est_diameter_min, максимальный диаметр est_diameter_max, яркость объекта absolute_magnitude, расстояние от Земли miss_distance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Достижимый уровень качества: предсказания должны иметь погрешность в среднем не более 10000 км/с. Для проверки будет использоваться метрика MAE (средняя абсолютная ошибка)"
]
},
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 2,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.compose import ColumnTransformer\n",
"# StandardScaler is imported from its public module, sklearn.preprocessing\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"import seaborn as sns\n",
"from sklearn.model_selection import cross_val_predict\n",
"from sklearn.metrics import mean_squared_error\n",
"import numpy as np\n",
"from sklearn import metrics\n",
"import sklearn.preprocessing as preproc\n",
"from sklearn.linear_model import LinearRegression, Ridge\n",
"from sklearn.metrics import mean_absolute_error\n",
"from mlxtend.evaluate import bias_variance_decomp\n",
"from sklearn.neural_network import MLPRegressor\n",
"\n",
"# Load the dataset and keep only the regression inputs plus the target column\n",
"df = pd.read_csv(\"..//static//csv//balanced_neo.csv\")\n",
"data = df[['est_diameter_min', 'est_diameter_max', 'absolute_magnitude', 'miss_distance', 'relative_velocity']]\n",
"\n",
"# Features and target (relative_velocity)\n",
"X = data.drop('relative_velocity', axis=1)\n",
"y = data['relative_velocity']\n",
"\n",
"# 80/20 train/test split with a fixed seed\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Numeric preprocessing: fill missing values with the column median\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"\n",
"preprocessing_num = Pipeline(\n",
"    [\n",
"        (\"imputer\", num_imputer)\n",
"    ]\n",
")\n",
"\n",
"# No categorical features are used, so no categorical transform is needed\n",
"\n",
"# Overall preprocessing: apply the numeric pipeline to every feature column\n",
"preprocessing = ColumnTransformer(\n",
"    [\n",
"        (\"nums\", preprocessing_num, X.columns)\n",
"    ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Линейная регрессия"
]
},
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 3,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'preprocessing': MinMaxScaler()}\n",
"Cредняя абсолютная ошибка (MAE) = 19241.554618019443\n",
"Смещение: 616083845.5088656\n",
"Дисперсия: 438598.9204950822\n"
]
}
],
"source": [
"# Linear-regression pipeline; the grid below swaps out the 'preprocessing' step\n",
"pipeline_lin_reg = Pipeline([\n",
"    ('preprocessing', preprocessing),\n",
"    ('model', LinearRegression()),\n",
"])\n",
"\n",
"# Hyperparameter grid: candidate feature scalers (None = no scaling).\n",
"# NOTE(review): every candidate replaces the whole 'preprocessing' step, so the\n",
"# median imputer configured in the setup cell is not part of any fitted\n",
"# pipeline - confirm this is intended.\n",
"param_grid = {\n",
"    'preprocessing': [StandardScaler(), preproc.MinMaxScaler(), preproc.MaxAbsScaler(), None]\n",
"}\n",
"\n",
"# 5-fold grid search scored by negated RMSE (negated because the search\n",
"# maximizes the score, while a smaller RMSE is better)\n",
"grid_search = GridSearchCV(pipeline_lin_reg, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)\n",
"\n",
"# Fit every candidate from the grid\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Best linear-regression pipeline found by the search\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"print(f'Cредняя абсолютная ошибка (MAE) = {mean_absolute_error(y_test, y_pred)}')\n",
"\n",
"# Bias-variance decomposition over 200 bootstrap rounds. A clone is passed because\n",
"# bias_variance_decomp refits the estimator it receives; passing best_model itself\n",
"# would leave it trained on a bootstrap sample instead of the full training set.\n",
"mse, bias, variance = bias_variance_decomp(clone(best_model), X_train.values, y_train.values, X_test.values, y_test.values, loss='mse', num_rounds=200, random_seed=123)\n",
"print(\"Смещение: \", bias)\n",
"print(\"Дисперсия: \", variance)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Гребневая регрессия"
]
},
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 4,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'model__alpha': 10.0, 'preprocessing': MinMaxScaler()}\n",
"Cредняя абсолютная ошибка (MAE) = 19239.098934204343\n",
"Смещение: 615921619.3705255\n",
"Дисперсия: 326886.495836047\n"
]
}
],
"source": [
"# Ridge-regression pipeline; the grid below swaps out the 'preprocessing' step\n",
"pipeline_ridge = Pipeline([\n",
"    ('preprocessing', preprocessing),\n",
"    ('model', Ridge()),\n",
"])\n",
"\n",
"# Hyperparameter grid.\n",
"# NOTE(review): every scaler candidate replaces the whole 'preprocessing' step, so\n",
"# the median imputer configured in the setup cell is not part of any fitted\n",
"# pipeline - confirm this is intended.\n",
"param_grid = {\n",
"    # Candidate feature scalers (None = no scaling)\n",
"    'preprocessing': [StandardScaler(), preproc.MinMaxScaler(), preproc.MaxAbsScaler(), None],\n",
"    # Regularization strength\n",
"    'model__alpha': [0, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0]\n",
"}\n",
"\n",
"# 5-fold grid search scored by negated RMSE (negated because the search\n",
"# maximizes the score, while a smaller RMSE is better)\n",
"grid_search = GridSearchCV(pipeline_ridge, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=0)\n",
"\n",
"# Fit every candidate from the grid\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Best ridge-regression pipeline found by the search\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"print(f'Cредняя абсолютная ошибка (MAE) = {mean_absolute_error(y_test, y_pred)}')\n",
"\n",
"# Bias-variance decomposition over 200 bootstrap rounds. A clone is passed because\n",
"# bias_variance_decomp refits the estimator it receives; passing best_model itself\n",
"# would leave it trained on a bootstrap sample instead of the full training set.\n",
"mse, bias, variance = bias_variance_decomp(clone(best_model), X_train.values, y_train.values, X_test.values, y_test.values, loss='mse', num_rounds=200, random_seed=123)\n",
"print(\"Смещение: \", bias)\n",
"print(\"Дисперсия: \", variance)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Гребневая регрессия показала почти такие же результаты, что и линейная регрессия"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Метод градиентного бустинга (набор деревьев решений)"
]
},
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 5,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-15 21:26:00 +04:00
"Лучшие гиперпараметры: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'preprocessing': MaxAbsScaler()}\n",
"Cредняя абсолютная ошибка (MAE) = 18908.8586008017\n",
"Смещение: -4.040641051222239\n",
"Дисперсия: 162335195.36332467\n"
2024-11-15 21:25:43 +04:00
]
}
],
"source": [
"# Gradient-boosting pipeline; random_state is fixed so reruns give the same numbers\n",
"pipeline_grad = Pipeline([\n",
"    ('preprocessing', preprocessing),\n",
"    ('model', GradientBoostingRegressor(random_state=42))\n",
"])\n",
"\n",
"# Hyperparameter grid.\n",
"# NOTE(review): every scaler candidate replaces the whole 'preprocessing' step, so\n",
"# the median imputer configured in the setup cell is not part of any fitted\n",
"# pipeline - confirm this is intended.\n",
"param_grid = {\n",
"    # Candidate feature scalers (None = no scaling)\n",
"    'preprocessing': [StandardScaler(), preproc.MinMaxScaler(), preproc.MaxAbsScaler(), None],\n",
"    # Number of boosting stages\n",
"    'model__n_estimators': [100, 200, 300],\n",
"    # Learning rate\n",
"    'model__learning_rate': [0.1, 0.2],\n",
"    # Maximum depth of each tree\n",
"    'model__max_depth': [3, 5, 7]\n",
"}\n",
"\n",
"# 2-fold grid search scored by negated RMSE\n",
"grid_search = GridSearchCV(pipeline_grad, param_grid, cv=2, scoring='neg_root_mean_squared_error', n_jobs=-1)\n",
"\n",
"# Fit every candidate from the grid\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Best gradient-boosting pipeline found by the search\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"print(f'Cредняя абсолютная ошибка (MAE) = {mean_absolute_error(y_test, y_pred)}')\n",
"\n",
"# Out-of-fold predictions on the training set (3 folds)\n",
"y_cv_pred = cross_val_predict(best_model, X_train, y_train, cv=3)\n",
"\n",
"# Reported as the bias: mean signed error of the out-of-fold predictions\n",
"bias = np.mean(y_cv_pred - y_train)\n",
"\n",
"# Reported as the variance: variance of the out-of-fold predictions themselves\n",
"variance = np.var(y_cv_pred)\n",
"\n",
"# NOTE(review): these two values are computed differently from the\n",
"# bias_variance_decomp results printed for the linear and ridge models, so the\n",
"# numbers are not directly comparable between the cells.\n",
"print(f\"Смещение: {bias}\")\n",
"print(f\"Дисперсия: {variance}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Вывод**: \n",
"\n",
"Все 3 модели регрессии не показали необходимого уровня \"погрешности\". Это означает, что необходимо использовать более сложные модели или что по доступным данным нельзя достичь необходимой погрешности.\n",
"\n",
"Из всех моделей градиентный бустинг показал самую низкую \"погрешность\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **2-я бизнес-цель (классификация):** \n",
"\n",
"Определение опасности космического объекта для увеличения безопасности Земли"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Целевой признак: опасность объекта hazardous\n",
"\n",
"Вход: минимальный диаметр est_diameter_min, максимальный диаметр est_diameter_max, яркость объекта absolute_magnitude, скорость relative_velocity"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Достижимый уровень качества: необходимо, чтобы точность предсказания модели составляла не менее 90%. Для проверки этого будет использована метрика Accuracy"
]
},
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 6,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.compose import ColumnTransformer\n",
"# StandardScaler is imported from its public module, sklearn.preprocessing\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"import seaborn as sns\n",
"from sklearn.model_selection import cross_val_predict\n",
"from sklearn.metrics import mean_squared_error\n",
"import numpy as np\n",
"from sklearn import metrics\n",
"\n",
"# Load the dataset and keep only the classification inputs plus the target column.\n",
"# Note: X, y, the train/test splits and `preprocessing` from the regression part\n",
"# are overwritten from here on.\n",
"df = pd.read_csv(\"..//static//csv//balanced_neo.csv\")\n",
"data = df[['est_diameter_min', 'est_diameter_max', 'absolute_magnitude', 'relative_velocity', 'hazardous']]\n",
"\n",
"# Features and target (hazardous)\n",
"X = data.drop('hazardous', axis=1)\n",
"y = data['hazardous']\n",
"\n",
"# 80/20 train/test split with a fixed seed\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Numeric preprocessing: median imputation followed by standardization\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
"    [\n",
"        (\"imputer\", num_imputer),\n",
"        (\"scaler\", num_scaler),\n",
"    ]\n",
")\n",
"\n",
"# Overall preprocessing: apply the numeric pipeline to every feature column\n",
"preprocessing = ColumnTransformer(\n",
"    [\n",
"        (\"nums\", preprocessing_num, X.columns),\n",
"    ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Логистическая регрессия"
]
},
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 7,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}\n",
2024-11-15 21:26:00 +04:00
"ROC у логистической регрессии = 0.8670873798838691\n",
2024-11-15 21:25:43 +04:00
"Точность = 0.8591628959276018\n"
]
},
{
"data": {
2024-11-15 21:26:00 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA6aElEQVR4nO3df5zNdf7///uc4cwvZrCzZgZTQz+UJUJ8RspbpqZfYttKsUwqbcVkzaoQhpKxFWlRlpLqXUtalUW8M8UiuwrTL2IxIsxoVs0wmOGc5/cPX6eZMaM54/x8ndv1cjmXi/Oa1+ucx3n5ce6er8fr+QwzxhgBAABYhM3fBQAAAHgS4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFhKPX8X4GtOp1MHDhxQw4YNFRYW5u9yAABALRhjdOTIETVr1kw227nHZkIu3Bw4cEDJycn+LgMAANTBvn371KJFi3PuE3LhpmHDhpJOn5zY2Fg/VwMAAGqjpKREycnJru/xcwm5cHPmUlRsbCzhBgCAIFOblhIaigEAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKX4Ndz885//VO/evdWsWTOFhYXp/fff/8VjVq9erY4dOyoiIkIXX3yx5s+f7/U6AQBA8PBruCktLVX79u01a9asWu2fn5+vW265RT179lReXp7++Mc/6oEHHtDKlSu9XCkAAAgWfl0486abbtJNN91U6/1nz56tli1baurUqZKkyy+/XOvWrdMLL7yg9PR0b5UJAIDbjDE6ftLh7zL8Jqp+eK0WufSGoFoVfMOGDUpLS6u0LT09XX/84x9rPKasrExlZWWu5yUlJd4qDwhoof4PLeBLxkh3zt6grQdD9ztn61Ppirb7J2YEVbgpKChQQkJCpW0JCQkqKSnR8ePHFRUVddYxOTk5mjhxoq9KRAgJprDAP7QAQklQhZu6GD16tLKyslzPS0pKlJyc7MeKEKwqhhnCAoDaaJMUq0UPpcpPV2f8Kqp+uN/eO6jCTWJiogoLCyttKywsVGxsbLWjNpIUERGhiIgIX5QHC6k6KmOVMBPK/9AC/uDPvpNQFlThJjU1VcuXL6+07aOPPlJqaqqfKoLVGGN0rNxR6yATbGGBf2gBhAK/hpujR49q586druf5+fnKy8tTkyZNdMEFF2j06NHav3+/3njjDUnSQw89pJkzZ+rxxx/Xfffdp48//ljvvPOOli1b5q+PAAtxOo1unbHunKGmapghLABA4PFruPn888/Vs2dP1/MzvTEZGRmaP3++Dh48qL1797p+3rJlSy1btkwjRozQiy++qBYtWuiVV17hNnCcN6fTqNe0NcovKnVtq25UhjADAIEvzBhj/F2EL5WUlCguLk7FxcWKjY31dznwszOXoW6dsc4VbFrGx2hpZndF2wkyABAo3Pn+DqqeG8BTauqtaRkfo9ysHrLZCDUAEKwINwg5NfXWtEmK1dLM7gQbAAhyhBuEFGPODjZnemu4DAUA1kC4QUg5Vu5wBRt6awDAmgg3CBnGGN05e4Pr+dLM7oqJ4K8AAFiNzd8FAL5ScdSmTVKsou3+mxocAOA9hBuEhKqjNqfnr+FSFABYEeEGIeH4SUZtACBUEG4Qchi1AQBrI9wg5JBrAMDaCDcAAMBSCDcAAMBSCDcICaG1PCwAhDbCDSyv6m3gAABrI9zA8qpO3hdVn9vAAcDKCDewtDMrgJ/BbeAAYH2EG1iW02nUa9oa5ReVSmLyPgAIFYQbWJIxp0dszgSbMyuAM2oDANZHuIElVVxuoWV8jHKzeshmI9gAQCgg3MCSKt76vTSzO8EGAEII4QaWU/XWb65EAUBoIdzAcqquAM6t3wAQWgg3sDRu/QaA0EO4gaWRawAg9BBuYDmsIwUAoY1wA0upOiMxACD0EG5gGVUn7qOZGABCE+
EGllF14j5mJAaA0ES4gSUxcR8AhC7CDSyjYiMxAzYAELoIN7CEqrMSAwBCF+EGlsCsxACAMwg3sBxmJQaA0Ea4gSXQbwMAOINwg6BHvw0AoCLCDYIe/TYAgIoIN7AU+m0AAIQbBD36bQAAFRFuENTotwEAVEW4QVCj3wYAUBXhBpZBvw0AQCLcwELINQAAiXCDIFexmRgAAIlwgyDmdBrdOmOdv8sAAAQYwg2CktNp1GvaGuUXlUqimRgA8DPCDYKOMadHbM4Em5bxMVqa2Z1mYgCAJMINgtCx8p9v/24ZH6PcrB6y2Qg2AIDTCDcIKlUn7Vua2Z1gAwCohHCDoFJ10r5oO302AIDKCDcIWkzaBwCoDuEGQYVFMgEAv4Rwg6DBIpkAgNog3CBosEgmAKA2CDcISvTbAABqQrhBUCLXAABqQrgBAACWQrgBAACW4vdwM2vWLKWkpCgyMlJdu3bVxo0bz7n/9OnT1bp1a0VFRSk5OVkjRozQiRMnfFQtAAAIdH4NNwsXLlRWVpays7O1efNmtW/fXunp6Tp06FC1+7/99tsaNWqUsrOztW3bNr366qtauHChxowZ4+PK4Q8V57gBAKAmfg0306ZN05AhQzR48GC1adNGs2fPVnR0tObNm1ft/p9++qmuvvpq9e/fXykpKbrhhht0zz33nHO0p6ysTCUlJZUeCD7McQMAqC2/hZvy8nJt2rRJaWlpPxdjsyktLU0bNlT/JdatWzdt2rTJFWZ2796t5cuX6+abb67xfXJychQXF+d6JCcne/aDwCeY4wYAUFv1/PXGRUVFcjgcSkhIqLQ9ISFB3377bbXH9O/fX0VFRerevbuMMTp16pQeeuihc16WGj16tLKyslzPS0pKCDhBjjluAADn4veGYnesXr1akydP1ksvvaTNmzdr8eLFWrZsmZ5++ukaj4mIiFBsbGylB4IPa0oBAGrLbyM38fHxCg8PV2FhYaXthYWFSkxMrPaYcePGaeDAgXrggQckSe3atVNpaakefPBBPfnkk7LZgiqroZbotwEAuMNvacBut6tTp07Kzc11bXM6ncrNzVVqamq1xxw7duysABMefrr3wnArjWXRbwMAcIffRm4kKSsrSxkZGercubO6dOmi6dOnq7S0VIMHD5YkDRo0SM2bN1dOTo4kqXfv3po2bZquvPJKde3aVTt37tS4cePUu3dvV8iBtdFvAwD4JX4NN/369dMPP/yg8ePHq6CgQB06dNCKFStcTcZ79+6tNFIzduxYhYWFaezYsdq/f79+/etfq3fv3nrmmWf89RHgA/TbAADcEWZC7HpOSUmJ4uLiVFxcTHNxEDDG6Ja/rHNdltr6VLqi7X7N5AAAP3Dn+5sOXAQ0+m0AAO4i3CBo0G8DAKgNwg0CGv02AAB3EW4QsJjfBgBQF4QbBKxj5fTbAADcR7hBQHI6jW6dsc71nH4bAEBtEW4QcIw5HWzyi0olnR61ibYzagMAqB3CDQJOxdu/W8bHaGlmd0ZtAAC1RrhBQFua2V02G8EGAFB7hBsEHG7/BgCcD8INAgq3fwMAzhfhBgGF5RYAAOeLcIOAxe3fAIC6INwgoNBvAwA4X4QbBAz6bQAAnkC4QcBguQUAgCcQbhAQqo7a0G8DAKgrwg0CQtVRG5ZbAADUFeEGfseoDQDAkwg38Luqc9swagMAOB+EGwQURm0AAOeLcIOAQq4BAJwvwg0AALAUwg38ruKsxAAAnC/CDfyKWYkBAJ5GuIFfsQo4AMDTCDcIGNwpBQDwBMINAga5BgDgCYQbAABgKYQbAABgKYQbAABgKYQb+BVz3AAAPI1wA79hjhsAgDecV7g5ceKEp+pACDpWzhw3AADPczvcOJ1OPf3002revLkaNGig3bt3S5LGjRunV1991eMFwpqqjtowxw0AwFPcDjeTJk3S/Pnz9eyzz8put7u2t23bVq+88opHi4N1VZ2ZONrOqA0AwDPcDjdvvPGG5syZowEDBi
g8/OcvpPbt2+vbb7/1aHEIDYzaAAA8ye1ws3//fl188cVnbXc6nTp58qRHikJoIdcAADzJ7XDTpk0brV279qzt777
2024-11-15 21:25:43 +04:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApIAAAIjCAYAAACwHvu2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2x0lEQVR4nO3deVxU5fv/8dcgsggCoiJiLrjjvpXhblLmvn7KMsWlzTRzXyrNNQpzSS3NTFHTzNxSK9M0JY3ctxRNUyMX1EREXJBlfn/4Y75OqMHIcBDez8/jPD7Mfe65z3UGHa+uc5/7mMxmsxkRERERkQxyMDoAEREREXk0KZEUEREREZsokRQRERERmyiRFBERERGbKJEUEREREZsokRQRERERmyiRFBERERGbKJEUEREREZsokRQRERERmyiRFJEHOn78OM888wyenp6YTCZWr16dqeOfPn0ak8lEWFhYpo77KGvSpAlNmjQxOgwRkf+kRFLkEfDnn3/y2muvUbp0aVxcXPDw8KB+/fp8/PHH3Lx5067HDg4O5tChQ0ycOJFFixZRp04dux4vK/Xo0QOTyYSHh8c9P8fjx49jMpkwmUx89NFHGR7/3LlzjBkzhv3792dCtCIi2Y+j0QGIyIN99913/O9//8PZ2Znu3btTpUoVbt++zbZt2xg6dCiHDx9mzpw5djn2zZs3iYiI4J133qFfv352OUbJkiW5efMmefPmtcv4/8XR0ZEbN26wdu1annvuOat9ixcvxsXFhVu3btk09rlz5xg7diylSpWiRo0a6X7fhg0bbDqeiEhWUyIpko2dOnWKLl26ULJkSTZv3kzRokUt+/r27cuJEyf47rvv7Hb8S5cuAeDl5WW3Y5hMJlxcXOw2/n9xdnamfv36fPXVV2kSySVLltCqVStWrFiRJbHcuHGDfPny4eTklCXHExF5WLq0LZKNhYaGEh8fzxdffGGVRKYqW7Ysb731luV1UlIS48ePp0yZMjg7O1OqVCnefvttEhISrN5XqlQpWrduzbZt23jiiSdwcXGhdOnSLFy40NJnzJgxlCxZEoChQ4diMpkoVaoUcOeScOrPdxszZgwmk8mqbePGjTRo0AAvLy/c3d2pUKECb7/9tmX//eZIbt68mYYNG+Lm5oaXlxft2rUjMjLynsc7ceIEPXr0wMvLC09PT3r27MmNGzfu/8H+y4svvsgPP/xAbGyspW3Xrl0cP36cF198MU3/mJgYhgwZQtWqVXF3d8fDw4MWLVpw4MABS58tW7bw+OOPA9CzZ0/LJfLU82zSpAlVqlRhz549NGrUiHz58lk+l3/PkQwODsbFxSXN+Tdv3pwCBQpw7ty5dJ+riEhmUiIpko2tXbuW0qVLU69evXT1f/nllxk9ejS1atVi6tSpNG7cmJCQELp06ZKm74kTJ+jcuTNPP/00kydPpkCBAvTo0YPDhw8D0LFjR6ZOnQrACy+8wKJFi5g2bVqG4j98+DCtW7cmISGBcePGMXnyZNq2bcv27dsf+L6ffvqJ5s2bc/HiRcaMGcOgQYP49ddfqV+/PqdPn07T/7nnnuPatWuEhITw3HPPERYWxtixY9MdZ8eOHTGZTKxcudLStmTJEipWrEitWrXS9D958iSrV6+mdevWTJkyhaFDh3Lo0CEaN25sSeoCAgIYN24cAK+++iqLFi1i0aJFNGrUyDLO5cuXadGiBTVq1GDatGk0bdr0nvF9/PHHFC5cmODgYJKTkwH47LPP2LBhAzNmzMDPzy/d5yoikqnMIpItXb161QyY27Vrl67++/fvNwPml19+2ap9yJAhZsC8efNmS1vJkiXNgDk8PNzSdvHiRbOzs7N58ODBlrZTp06ZAfOkSZOsxgwODjaXLFkyTQzvvfee+e6vlalTp5oB86VLl+4bd+ox5s+fb2mrUaOG2cfHx3z58mVL24EDB8wODg7m7t27pzler169rMbs0KGDuWDBgvc95t3n4e
bmZjabzebOnTubmzVrZjabzebk5GSzr6+veezYsff8DG7dumVOTk5Ocx7Ozs7mcePGWdp27dqV5txSNW7c2AyYZ8+efc99jRs3tmr78ccfzYB5woQJ5pMnT5rd3d3N7du3/89zFBGxJ1UkRbKpuLg4APLnz5+u/t9//z0AgwYNsmofPHgwQJq5lJUqVaJhw4aW14ULF6ZChQqcPHnS5pj/LXVu5bfffktKSkq63nP+/Hn2799Pjx498Pb2trRXq1aNp59+2nKed3v99detXjds2JDLly9bPsP0ePHFF9myZQvR0dFs3ryZ6Ojoe17WhjvzKh0c7nx9Jicnc/nyZctl+71796b7mM7OzvTs2TNdfZ955hlee+01xo0bR8eOHXFxceGzzz5L97FEROxBiaRINuXh4QHAtWvX0tX/r7/+wsHBgbJly1q1+/r64uXlxV9//WXVXqJEiTRjFChQgCtXrtgYcVrPP/889evX5+WXX6ZIkSJ06dKFZcuWPTCpTI2zQoUKafYFBATwzz//cP36dav2f59LgQIFADJ0Li1btiR//vx8/fXXLF68mMcffzzNZ5kqJSWFqVOnUq5cOZydnSlUqBCFCxfm4MGDXL16Nd3HLFasWIZurPnoo4/w9vZm//79TJ8+HR8fn3S/V0TEHpRIimRTHh4e+Pn58fvvv2foff++2eV+8uTJc892s9ls8zFS5++lcnV1JTw8nJ9++olu3bpx8OBBnn/+eZ5++uk0fR/Gw5xLKmdnZzp27MiCBQtYtWrVfauRAO+//z6DBg2iUaNGfPnll/z4449s3LiRypUrp7vyCnc+n4zYt28fFy9eBODQoUMZeq+IiD0okRTJxlq3bs2ff/5JRETEf/YtWbIkKSkpHD9+3Kr9woULxMbGWu7AzgwFChSwusM51b+rngAODg40a9aMKVOmcOTIESZOnMjmzZv5+eef7zl2apzHjh1Ls+/o0aMUKlQINze3hzuB+3jxxRfZt28f165du+cNSqmWL19O06ZN+eKLL+jSpQvPPPMMQUFBaT6T9Cb16XH9+nV69uxJpUqVePXVVwkNDWXXrl2ZNr6IiC2USIpkY8OGDcPNzY2XX36ZCxcupNn/559/8vHHHwN3Ls0Cae6snjJlCgCtWrXKtLjKlCnD1atXOXjwoKXt/PnzrFq1yqpfTExMmvemLsz97yWJUhUtWpQaNWqwYMECq8Ts999/Z8OGDZbztIemTZsyfvx4Zs6cia+v73375cmTJ02185tvvuHs2bNWbakJ772S7owaPnw4UVFRLFiwgClTplCqVCmCg4Pv+zmKiGQFLUguko2VKVOGJUuW8PzzzxMQEGD1ZJtff/2Vb775hh49egBQvXp1goODmTNnDrGxsTRu3JidO3eyYMEC2rdvf9+lZWzRpUsXhg8fTocOHejfvz83btxg1qxZlC9f3upmk3HjxhEeHk6rVq0oWbIkFy9e5NNPP+Wxxx6jQYMG9x1/0qRJtGjRgsDAQHr37s3NmzeZMWMGnp6ejBkzJtPO498cHBx49913/7Nf69atGTduHD179qRevXocOnSIxYsXU7p0aat+ZcqUwcvLi9mzZ5M/f37c3NyoW7cu/v7+GYpr8+bNfPrpp7z33nuW5Yjmz59PkyZNGDVqFKGhoRkaT0Qks6giKZLNtW3bloMHD9K5c2e+/fZb+vbty4gRIzh9+jSTJ09m+vTplr5z585l7Nix7Nq1iwEDBrB582ZGjhzJ0qVLMzWmggULsmrVKvLly8ewYcNYsGABISEhtGnTJk3sJUqUYN68efTt25dPPvmERo0asXnzZjw9Pe87flBQEOvXr6dgwYKMHj2ajz76iCeffJLt27dnOAmzh7fffpvBgwfz448/8tZbb7F3716+++47ihcvbtUvb968LFiwgDx58vD666/zwgsvsHXr1gwd69q1a/Tq1YuaNWvyzjvvWNobNmzIW2+9xeTJk/ntt98y5bxERDLKZM7IbH
QRERERkf9PFUkRERERsYkSSRERERGxiRJJEREREbGJEkkRERERsYkSSRERERGxiRJJEREREbGJEkkRERERsUmOfLJ
"text/plain": [
"<Figure size 800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-15 21:26:00 +04:00
"Смещение: 0.8529941124698746\n",
"Дисперсия: 0.0065558753718589465\n"
2024-11-15 21:25:43 +04:00
]
}
],
"source": [
"# Logistic-regression pipeline; random_state is fixed because the liblinear and\n",
"# saga solvers shuffle the data, which would make reruns differ\n",
"pipeline_logreg = Pipeline([\n",
"    ('preprocessing', preprocessing),\n",
"    ('classifier', LogisticRegression(random_state=42))\n",
"])\n",
"\n",
"# Hyperparameter grid to search\n",
"param_grid = {\n",
"    # Inverse regularization strength: the smaller the value, the stronger the regularization\n",
"    'classifier__C': [0.1, 0.5, 1],\n",
"    # Penalty type\n",
"    'classifier__penalty': ['l1', 'l2'],\n",
"    # Optimization algorithm\n",
"    'classifier__solver': ['liblinear', 'saga']\n",
"}\n",
"\n",
"# 5-fold grid search that selects the candidate with the highest accuracy\n",
"grid_search = GridSearchCV(pipeline_logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)\n",
"\n",
"# Fit every candidate from the grid\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Best logistic-regression pipeline found by the search\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"# ROC AUC on the test set, from the predicted probability of the positive class\n",
"y_pred_proba = best_model.predict_proba(X_test)[:, 1]\n",
"print(f'ROC у логистической регрессии = {roc_auc_score(y_test, y_pred_proba)}')\n",
"\n",
"# Accuracy on the test set\n",
"y_pred = best_model.predict(X_test)\n",
"print(f'Точность = {accuracy_score(y_test, y_pred)}')\n",
"\n",
"fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)\n",
"\n",
"# Plot the ROC curve\n",
"plt.plot(fpr, tpr)\n",
"plt.ylabel('True Positive Rate')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.show()\n",
"\n",
"# Confusion matrix on the test set\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"\n",
"# Draw the confusion matrix as an annotated heatmap\n",
"plt.figure(figsize=(8, 6))\n",
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', \n",
"            xticklabels=['Предсказанный \"безопасный\"', 'Предсказанный \"опасный\"'], \n",
"            yticklabels=['Действительно \"безопасный\"', 'Действительно \"опасный\"'])\n",
"plt.title('Confusion Matrix')\n",
"plt.ylabel('Actual')\n",
"plt.xlabel('Predicted')\n",
"plt.show()\n",
"\n",
"# Cross-validation accuracy statistics for every grid candidate.\n",
"# NOTE(review): the two values printed below are the mean CV accuracy and the mean\n",
"# CV standard deviation averaged over all grid candidates; they are labelled as\n",
"# bias and variance but are not a bias/variance decomposition.\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
"print(f\"Дисперсия: {std_test_score.mean()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Метод случайного леса (набор деревьев решений)"
]
},
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 8,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
2024-11-15 21:26:00 +04:00
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\AI labs\\aimenv\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
" _data = np.array(data, dtype=dtype, copy=copy,\n"
]
},
2024-11-15 21:25:43 +04:00
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-15 21:26:00 +04:00
"Лучшие гиперпараметры: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__n_estimators': 200}\n",
"ROC у метода случайного леса = 0.9009594886141752\n",
"Точность = 0.8721719457013575\n"
2024-11-15 21:25:43 +04:00
]
},
{
"data": {
2024-11-15 21:26:00 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABDcklEQVR4nO3de5zM9eLH8ffsbXaxu2jbXdZq3S+RaxwkB1tUR9Q5pfghlc7JJYduCEsJp4t0SjmUpFORrk7EQQjpKGwREVauu6zLrt1ld818f3/IV5vFzpqZ78zs6/l4zKPP97vf78x7vjbz9p3vxWYYhiEAAIAAEWR1AAAAAHei3AAAgIBCuQEAAAGFcgMAAAIK5QYAAAQUyg0AAAgolBsAABBQQqwO4G1Op1MHDx5UZGSkbDab1XEAAEAJGIahkydPqmrVqgoKuvS+mTJXbg4ePKjExESrYwAAgFLYt2+fqlWrdsllyly5iYyMlHR240RFRVmcBgAAlER2drYSExPNz/FLKXPl5txXUVFRUZQbAAD8TEkOKeGAYgAAEFAoNwAAIKBQbgAAQECh3AAAgIBCuQEAAAGFcgMAAAIK5QYAAAQUyg0AAAgolBsAABBQKDcAACCgWFpuvvrqK3Xr1k1Vq1aVzWbTp59+etl1Vq5cqebNm8tut6t27dqaPXu2x3MCAAD/YWm5yc3NVZMmTTRt2rQSLZ+WlqbbbrtNHTt2VGpqqv7+97/rwQcf1JIlSzycFAAA+AtLb5x5yy236JZbbinx8tOnT1eNGjX04osvSpIaNGigNWvW6KWXXlKXLl08FRNwmWEYOpFXqNyCM1ZHAQCvCwsJUmxkuGWv71d3BV+3bp2Sk5OLzOvSpYv+/ve/X3Sd/Px85efnm9PZ2dmeiocAc7rQoR0ZJ7Vp7wl998txnTxdqF1HcrTv2ClFhV/6f53ThU4VOJxeSgoAvqV59Yr6eGA7y17fr8pNenq64uLiisyLi4tTdna2Tp06pYiIiAvWmTRpksaPH++tiAgAb6zerTfXpOlQ1umLLpN9umR7ZMJCgmRzVzAA8BOhwdaer+RX5aY0Ro4cqeHDh5vT2dnZSkxMtDARfNkvR3M1YeG2IvPKhwUrt8ChNjWvUp24Cqp1dQU1rBqlq8qHXfR5wkKCdHWkXfaQYE9HBgD8jl+Vm/j4eGVkZBSZl5GRoaioqGL32kiS3W6X3W73Rjz4McMwVGPkoiLzHu9ST/e2qq7KlygxAADf41fXuWnTpo2WL19eZN7SpUvVpk0bixIhEKzbdfSCYtO6RmUN6libYgMAfsjSPTc5OTnauXOnOZ2WlqbU1FRVrlxZ1atX18iRI3XgwAHNmTNHkvS3v/1Nr776qp544gndf//9+vLLL/XBBx9o4cKFVr0F+LkpS3fon8t/LjJvz+TbLEoDAHAHS8vNd999p44dO5rT546N6devn2bPnq1Dhw5p79695s9r1KihhQsXatiwYXr55ZdVrVo1vfHGG5wGDpedyCvQuAU/6tPUg+a8f97bTLc3qWphKgCAO9gMwzCsDuFN2dnZio6OVlZWlqKioqyOAwsUOpyq89QXReYtG95BtWMrWJQIAHA5rnx++9UxN4A7/H1uapHphY/cQLEBgADiV2dLAVfqiQ+/18LNh8xpjq8BgMBDuUGZ8P2+E3rg7W+VmVNgzvvoYc6yA4BARLlBmTB07qYixWb9qM6KjbLuvicAAM+h3CDgnSpwaM/RPElSk8SK+uhvbRRi8aXBAQCew9/wCGhnHE41GLvYnP575zoUGwAIcPwtj4B278xvikzfUCfGoiQAAG/haykEpLyCM2o4dkmReWmTbpXNxj26ASDQsecGAen3xWbRI+0pNgBQRrDnBgHns9QDRabZYwMAZQt7bhBw3v3m/P3Itk/oSrEBgDKGcoOA8vR/tmr9nmOSpOE31ZU9JNjiRAAAb6PcIGDMWpOmWWvTzOlO9WMtTAMAsArH3C
AgnC506OnPt5rTXz3eUdWvKmdhIgCAVdhzA79nGIaSp6wyp+c99AeKDQCUYZQb+L0h72/S/uOnzOnWNa+yMA0AwGqUG/i9z384ZI53TLjFwiQAAF9AuYFfu+k3X0d9OqidwkL4lQaAso4DiuGXHE5DtUYtKjKvaWJFa8IAAHwK/8yFX7rppfN7bKLCQ/TDuJstTAMA8CXsuYHf+c/3B7X7SK45/e3oZC7WBwAwUW7gV2atSStyPZst47tQbAAARfC1FPzKb4vNK/c2UwU7/RwAUBTlBn5j//E8czzilvrq1qSqhWkAAL6KcgO/cDy3QDf8Y4U53b9dknVhAAA+jXIDv/D55vMX6qsdW4HjbAAAF0W5gc8zDENjPt1iTn8xtL2FaQAAvo5yA5/Xc8Y35vjeVokKDebXFgBwcXxKwOetTztmjsff3sjCJAAAf0C5gU9zOg1zPO+hP3DvKADAZfFJAb9RNy7S6ggAAD9AuYFPO5h1yuoIAAA/Q7mBT9v1m3tIVSwXamESAIC/oNzAp324Yb+ks9e2sdlsFqcBAPgDyg18ltNp6D/fH5QknTxdaHEaAIC/oNzAZw2dl2qOJ97R2LogAAC/QrmBTzp44pS510aSOjeIszANAMCfUG7gc3YezlHbyV+a0y/c1cTCNAAAf0O5gc95c81uc3xD7Rj1aFrVwjQAAH8TYnUA4PdS92VJkm5uGKcZfVtanAYA4G/YcwOfs+1QtiSpdc2rLE4CAPBHlBv4rCbVoq2OAADwQ5Qb+JT73lpvjqtVKmdhEgCAv6LcwKes3H7EHMdHh1uYBADgryg38BnHcgvM8edDbrAwCQDAn1Fu4DOaP7PUHFe/iq+kAAClQ7mBTzjjcJrj1jUqKyqcO4ADAEqHcgOfcMZpmOMZfbi2DQCg9Cg38AnTVuw0x/ZQfi0BAKXHpwh8witfni834aHBFiYBAPg7yg0st/donjke1LGWhUkAAIGAcgPLbTmYZY7/2oFyAwC4MpQbWM7x68HELa+pxFlSAIArRrmB5Ya8v0mSFBRkszgJACAQUG5gqRXbD5vjhIoRFiYBAAQKyg0sNXXZz+b4H3++zsIkAIBAQbmBpb7fd0KS9EjnOgoL4dcRAHDlLP80mTZtmpKSkhQeHq7WrVtr/fr1l1x+6tSpqlevniIiIpSYmKhhw4bp9OnTXkoLd7pr+tfmuEPdGAuTAAACiaXlZt68eRo+fLhSUlK0ceNGNWnSRF26dNHhw4eLXf69997TiBEjlJKSom3btunNN9/UvHnzNGrUKC8nx5XafSRH3+45bk63uKayhWkAAIHE0nIzZcoUDRgwQP3791fDhg01ffp0lStXTrNmzSp2+a+//lrt2rVTr169lJSUpJtvvln33nvvJff25OfnKzs7u8gD1jpyMl+dXlxlTi8b3sHCNACAQGNZuSkoKNCGDRuUnJx8PkxQkJKTk7Vu3bpi12nbtq02bNhglpndu3dr0aJFuvXWWy/6OpMmTVJ0dLT5SExMdO8bgcuuf3aZOX6m+7WqHVvBwjQAgEBjWbnJzMyUw+FQXFxckflxcXFKT08vdp1evXrp6aef1g033KDQ0FDVqlVLf/zjHy/5tdTIkSOVlZVlPvbt2+fW94GSy8zJV9KIheZ00lXl1KdNknWBAAAByfIDil2xcuVKTZw4Ua+99po2btyojz/+WAsXLtQzzzxz0XXsdruioqKKPGCNh+Z8V2R6+aN/tCYIACCghVj1wjExMQoODlZGRkaR+RkZGYqPjy92nTFjxqhPnz568MEHJUmNGzdWbm6uHnroIT311FMKCvKrrlbmbNx7whynTbpVNhtXJAYAuJ9lbSAsLEwtWrTQ8uXLzXlOp1PLly9XmzZtil0nLy/vggITHBwsSTIMw3NhccWO5xaY4zn3t6LYAAA8xrI9N5I0fPhw9evXTy1btlSrVq00depU5ebmqn///pKkvn37KiEhQZMmTZIkdevWTVOmTFGzZs3Uun
Vr7dy5U2PGjFG3bt3MkgPf1OyZpeb4DzWvsjAJACDQWVpuevbsqSNHjmjs2LFKT09X06ZNtXjxYvMg47179xbZUzN
2024-11-15 21:25:43 +04:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
2024-11-15 21:26:00 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApIAAAIjCAYAAACwHvu2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB1zUlEQVR4nO3dd1gU1/s28HtpS28qIHwVQVDB3mIQa0SxF0wUNYolGo3Gjkqixk6CDbEbC2g0xliIJRaCUSwEFUWNIrGGWBAjAiJSd98/fJmfGyywshxk70+uuS72zJkzzy5mfXzmzBmZUqlUgoiIiIiomHREB0BERERE7ycmkkRERESkFiaSRERERKQWJpJEREREpBYmkkRERESkFiaSRERERKQWJpJEREREpBYmkkRERESkFiaSRERERKQWJpJE9EbXr19Hhw4dYGFhAZlMhvDw8BId/86dO5DJZAgNDS3Rcd9nbdq0QZs2bUSHQUT0Vkwkid4DN2/exOeffw5nZ2cYGhrC3Nwcnp6eWLZsGZ4/f67Rc/v5+eHy5cuYP38+tmzZgiZNmmj0fKVp8ODBkMlkMDc3f+XneP36dchkMshkMixatKjY49+/fx+zZs1CXFxcCURLRFT26IkOgIje7MCBA/jkk08gl8sxaNAg1KlTBzk5OTh58iT8/f1x5coVrFu3TiPnfv78OaKjo/H1119jzJgxGjmHo6Mjnj9/Dn19fY2M/zZ6enrIzMzEvn370KdPH5V9W7duhaGhIbKystQa+/79+5g9ezaqVauGBg0aFPm4I0eOqHU+IqLSxkSSqAy7ffs2fH194ejoiKNHj6Jy5crSvtGjR+PGjRs4cOCAxs7/6NEjAIClpaXGziGTyWBoaKix8d9GLpfD09MTP/74Y6FEctu2bejSpQt27dpVKrFkZmbC2NgYBgYGpXI+IqJ3xUvbRGVYUFAQMjIysGHDBpUksoCLiwvGjRsnvc7Ly8PcuXNRvXp1yOVyVKtWDV999RWys7NVjqtWrRq6du2KkydP4oMPPoChoSGcnZ2xefNmqc+sWbPg6OgIAPD394dMJkO1atUAvLgkXPDzy2bNmgWZTKbSFhERgRYtWsDS0hKmpqaoWbMmvvrqK2n/6+ZIHj16FC1btoSJiQksLS3Ro0cPxMfHv/J8N27cwODBg2FpaQkLCwsMGTIEmZmZr/9g/6N///44ePAgUlNTpbazZ8/i+vXr6N+/f6H+KSkpmDx5MurWrQtTU1OYm5ujU6dOuHjxotTn2LFjaNq0KQBgyJAh0iXygvfZpk0b1KlTB7GxsWjVqhWMjY2lz+W/cyT9/PxgaGhY6P17e3vDysoK9+/fL/J7JSIqSUwkicqwffv2wdnZGc2bNy9S/88++wwzZ85Eo0aNsHTpUrRu3RqBgYHw9fUt1PfGjRv4+OOP0b59eyxevBhWVlYYPHgwrly5AgDw8fHB0qVLAQD9+vXDli1bEBwcXKz4r1y5gq5duyI7Oxtz5szB4sWL0b17d5w6deqNx/3222/w9vZGcnIyZs2ahYkTJ+L06dPw9PTEnTt3CvXv06cPnj59isDAQPTp0wehoaGYPXt2keP08fGBTCbD7t27pbZt27ahVq1aaNSoUaH+t27dQnh4OLp27YolS5bA398fly9fRuvWraWkzs3NDXPmzAEAjBgxAlu2bMGWLVvQqlUraZzHjx+jU6dOaNCgAYKDg9G2bdtXxrds2TJUqlQJfn5+yM/PBwCsXbsWR44cwfLly2Fvb1/k90pEVKKURFQmpaWlKQEoe/ToUaT+cXFxSgDKzz77TKV98uTJSgDKo0ePSm2Ojo5KAMqoqCipLTk5WSmXy5WTJk2S2m7fvq0EoFy4cKHKmH5+fkpHR8dCMXzzzTfKl79Wli5dqgSgfPTo0WvjLjjHpk2bpLYGDRoobWxslI8fP5baLl68qNTR0VEOGjSo0PmGDh2qMmavXr2UFSpUeO05X34fJi
YmSqVSqfz444+V7dq1UyqVSmV+fr7Szs5OOXv27Fd+BllZWcr8/PxC70MulyvnzJkjtZ09e7bQeyvQunVrJQDlmjVrXrmvdevWKm2HDx9WAlDOmzdPeevWLaWpqamyZ8+eb32PRESaxIokURmVnp4OADAzMytS/19//RUAMHHiRJX2SZMmAUChuZTu7u5o2bKl9LpSpUqoWbMmbt26pXbM/1Uwt/KXX36BQqEo0jEPHjxAXFwcBg8eDGtra6m9Xr16aN++vfQ+XzZy5EiV1y1btsTjx4+lz7Ao+vfvj2PHjiEpKQlHjx5FUlLSKy9rAy/mVerovPj6zM/Px+PHj6XL9ufPny/yOeVyOYYMGVKkvh06dMDnn3+OOXPmwMfHB4aGhli7dm2Rz0VEpAlMJInKKHNzcwDA06dPi9T/77//ho6ODlxcXFTa7ezsYGlpib///lulvWrVqoXGsLKywpMnT9SMuLC+ffvC09MTn332GWxtbeHr64sdO3a8MaksiLNmzZqF9rm5ueHff//Fs2fPVNr/+16srKwAoFjvpXPnzjAzM8NPP/2ErVu3omnTpoU+ywIKhQJLly6Fq6sr5HI5KlasiEqVKuHSpUtIS0sr8jkdHByKdWPNokWLYG1tjbi4OISEhMDGxqbIxxIRaQITSaIyytzcHPb29vjzzz+Lddx/b3Z5HV1d3Ve2K5VKtc9RMH+vgJGREaKiovDbb79h4MCBuHTpEvr27Yv27dsX6vsu3uW9FJDL5fDx8UFYWBj27Nnz2mokACxYsAATJ05Eq1at8MMPP+Dw4cOIiIhA7dq1i1x5BV58PsVx4cIFJCcnAwAuX75crGOJiDSBiSRRGda1a1fcvHkT0dHRb+3r6OgIhUKB69evq7Q/fPgQqamp0h3YJcHKykrlDucC/616AoCOjg7atWuHJUuW4OrVq5g/fz6OHj2K33///ZVjF8SZkJBQaN+1a9dQsWJFmJiYvNsbeI3+/fvjwoULePr06StvUCqwc+dOtG3bFhs2bICvry86dOgALy+vQp9JUZP6onj27BmGDBkCd3d3jBgxAkFBQTh79myJjU9EpA4mkkRl2JQpU2BiYoLPPvsMDx8+LLT/5s2bWLZsGYAXl2YBFLqzesmSJQCALl26lFhc1atXR1paGi5duiS1PXjwAHv27FHpl5KSUujYgoW5/7skUYHKlSujQYMGCAsLU0nM/vzzTxw5ckR6n5rQtm1bzJ07FytWrICdnd1r++nq6haqdv7888+4d++eSltBwvuqpLu4pk6disTERISFhWHJkiWoVq0a/Pz8Xvs5EhGVBi5ITlSGVa9eHdu2bUPfvn3h5uam8mSb06dP4+eff8bgwYMBAPXr14efnx/WrVuH1NRUtG7dGmfOnEFYWBh69uz52qVl1OHr64upU6eiV69eGDt2LDIzM7F69WrUqFFD5WaTOXPmICoqCl26dIGjoyOSk5OxatUq/O9//0OLFi1eO/7ChQvRqVMneHh4YNiwYXj+/DmWL18OCwsLzJo1q8Tex3/p6Ohg+vTpb+3XtWtXzJkzB0OGDEHz5s1x+fJlbN26Fc7Ozir9qlevDktLS6xZswZmZmYwMTFBs2bN4OTkVKy4jh49ilWrVuGbb76RliPatGkT2rRpgxkzZiAoKKhY4xERlRRWJInKuO7du+PSpUv4+OOP8csvv2D06NGYNm0a7ty5g8WLFyMkJETqu379esyePRtnz57F+PHjcfToUQQEBGD79u0lGlOFChWwZ88eGBsbY8qUKQgLC0NgYCC6detWKPaqVati48aNGD16NFauXIlWrVrh6NGjsLCweO34Xl5eOHToECpUqICZM2di0aJF+PDDD3Hq1KliJ2Ga8NVXX2HSpEk4fPgwxo0bh/Pnz+PAgQOoUqWKSj99fX2EhYVBV1cXI0eORL9+/XD8+PFinevp06cYOnQoGjZsiK+//lpqb9myJcaNG4fFixfjjz/+KJH3RURUXDJlcW
ajExERERH9f6xIEhEREZFamEgSERERkVqYSBIRERGRWphIEhEREZFamEgSERERkVqYSBIRERGRWphIEhEREZFayuW
2024-11-15 21:25:43 +04:00
"text/plain": [
"<Figure size 800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-15 21:26:00 +04:00
"Смещение: 0.8684929985365014\n",
"Дисперсия: 0.003137100883428496\n"
2024-11-15 21:25:43 +04:00
]
}
],
"source": [
"# Конвейер для случайного леса\n",
"pipeline_ranfor = Pipeline([\n",
" ('preprocessing', preprocessing),\n",
" ('classifier', RandomForestClassifier())\n",
"])\n",
"\n",
"# Определение сетки гиперпараметров\n",
"param_grid = {\n",
" #Количество деревьев в лесу\n",
" 'classifier__n_estimators': [50, 100, 200],\n",
" #Максимальная глубина дерева\n",
" 'classifier__max_depth': [10, 20, 30],\n",
" #Минимальное количество образцов для листового узла\n",
" 'classifier__min_samples_leaf': [1, 2, 4]\n",
"}\n",
"\n",
"# Создание объекта GridSearchCV\n",
"grid_search = GridSearchCV(pipeline_ranfor, param_grid, cv=5, scoring='accuracy', n_jobs=-1)\n",
"\n",
"# Обучение модели с перебором гиперпараметров\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Лучшая модель случайного леса\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"# Использование и оценка лучшей модели\n",
"y_pred_proba = best_model.predict_proba(X_test)[:, 1]\n",
"print(f'ROC у метода случайного леса = {roc_auc_score(y_test, y_pred_proba)}')\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"print(f'Точность = {accuracy_score(y_test, y_pred)}')\n",
"\n",
"fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)\n",
"\n",
"# построение ROC кривой\n",
"plt.plot(fpr, tpr)\n",
"plt.ylabel('True Positive Rate')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.show()\n",
"\n",
"# Построение матрицы ошибок\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"\n",
"# Визуализация матрицы ошибок\n",
"plt.figure(figsize=(8, 6))\n",
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', \n",
" xticklabels=['Предсказанный \"безопасный\"', 'Предсказанный \"опасный\"'], \n",
" yticklabels=['Действительно \"безопасный\"', 'Действительно \"опасный\"'])\n",
"plt.title('Confusion Matrix')\n",
"plt.ylabel('Actual')\n",
"plt.xlabel('Predicted')\n",
"plt.show()\n",
"\n",
"# Оценка дисперсии и смещения\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
"print(f\"Дисперсия: {std_test_score.mean()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Градиентный бустинг"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 300, 'classifier__subsample': 0.5}\n",
"ROC у метода градиентного спуска = 0.9012421336337971\n",
"Точность = 0.872737556561086\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABC8klEQVR4nO3dfXzO9eLH8fe12a4N29Da5maam9zlZm7iIIlWVEecTuWUg9TRKTd12lEhLCWcinSinJRUv4rqqJxoDoqDdMisFCY3i7CxsDtsdl3f3x/LVzuGXXNd1/e6rr2ej8f1eHy+332/1/XeF13vvrc2wzAMAQAABIggqwMAAAC4E+UGAAAEFMoNAAAIKJQbAAAQUCg3AAAgoFBuAABAQKHcAACAgFLN6gDe5nQ6dfDgQUVERMhms1kdBwAAVIBhGMrPz1e9evUUFHThfTNVrtwcPHhQ8fHxVscAAACVsH//fjVo0OCCy1S5chMRESGpdONERkZanAYAAFREXl6e4uPjze/xC6ly5ebMoajIyEjKDQAAfqYip5RwQjEAAAgolBsAABBQKDcAACCgUG4AAEBAodwAAICAQrkBAAABhXIDAAACCuUGAAAEFMoNAAAIKJQbAAAQUCwtN//5z3/Ur18/1atXTzabTR9//PFF11m9erU6dOggu92upk2basGCBR7PCQAA/Iel5aawsFDt2rXTnDlzKrT83r17dcstt6hXr15KT0/XX/7yF/3pT3/S8uXLPZwUAAD4C0sfnHnTTTfppptuqvDyc+fOVaNGjTRjxgxJUsuWLbVu3Tq98MIL6tOnj6diAuU6nH9KxSVOq2MAgM8JrRakmIgwyz7fr54KvmHDBiUlJZWZ16dPH/3lL3857zpFRUUqKioyp/Py8jwVDwHktMOpBesz9dl3h1TiNBQeElzm5//de9SiZADg+zo0rKXFI7pb9vl+VW6ysrIUGxtbZl5sbKzy8vJ08uRJhYeHn7POtGnTNHnyZG9FhJ/bkZWnvrPWurSOvRrn5QPAr4UEW/vfRb8qN5Uxbtw4JScnm9N5eXmKj4+3MBF80f6jJ9Tj2S/K/dldnRuqQ8NaCg8tu/cmNjJMVyfU8UY8AIAL/KrcxMXFKTs7u8y87OxsRUZGlrvXRpLsdrvsdrs34sEPFZc41ePZz5WdV1Rmfv1a4fr3I9eqht2v/okAAORn5aZr165atmxZmXkrVqxQ165dLUoEf3fDC2vKFJuklrGaN6SjbDabhakAAJfC0nJTUFCgXbt2mdN79+5Venq66tSpo4YNG2rcuHE6cOCA3nrrLUnSAw88oNmzZ+uxxx7Tvffeq88//1zvv/++li5datWvAD/2RcZh/fjzCXM6Y0pf2asFX2ANAIA/sPSMn6+//lrt27dX+/btJUnJyclq3769Jk2aJEk6dOiQ9u3bZy7fqFEjLV26VCtWrFC7du00Y8YMvfbaa1wGDpdt/vGYhr2xyZz+bnIfig0ABAibYRiG1SG8KS8vT1FRUcrNzVVkZKTVceBlmTmFuu751WXmvXVvZ13b7HJrAgEAKsSV72+uYUWV8c3+4+cUm+E9GlFsACDA+NUJxcCl6D9nvTm+ql6kFo/oxqEoAAhAlBtUCb9+TMJNreP0yh87WpgGAOBJHJZCwDtZ7NCAX+21mX5bWwvTAAA8jT03CGh3zt2gjZllnwMVVT3EojQAAG9gzw0CVkFRyTnF5pOR1j3IDQDgHey5QcC691f3sfkm5UZFhbPHBgCqAvbcICDtPlJQZq8NxQYAqg7KDQJOUYlD189YY06n/qWHhWkAAN5GuUHAmf352eeVtakfpRZx3IkaAKoSyg0CyoL1e/XSr8rNklGcQAwAVQ0nFCMg7D96Qj2e/aLMvGd/31Y2m82iRAAAq7DnBgEhff/xMtPjbmqhOzo1sCYMAMBS7LlBQHjsw28lSS3iIpT6l2stTgMAsBJ7buD3Hnh7s06edkiS6kaFWZ
wGAGA1yg382pe7cpT6fZY5PXcwD8QEgKqOcgO/dbSwWHe/9l9z+t+PXCt7tWALEwEAfAHlBn6rw9MrzHFKv1ZqFhthYRoAgK+g3MDvNb68hoZ1b2R1DACAj6DcwO/sP3pCCWOXmtMf/LmrhWkAAL6GcgO/k/x+epnpOjVCrQkCAPBJlBv4leISpzZlHjOndzzdl7sQAwDKoNzAr+SePG2O1z7WS2EhXB0FACiLcgO/8t2BXHMcX6e6hUkAAL6KcgO/MmzBJqsjAAB8HOUGfuWyX04eHtY9wdogAACfRbmB3ygoKtHPhcWSpP6J9S1OAwDwVZQb+I2l3x40xy3iuBsxAKB8lBv4jaOFpVdKBdnEVVIAgPOi3MAv5J06rb+l7pAkdW8abXEaAIAvo9zA5x3KPam2T/7bnL7xqjgL0wAAfB3lBj6v67TPzXHnhDoa/JsrLEwDAPB1lBv4tLc3ZJrjGqHBeu/+31gXBgDgF6pZHQA4n9fW7tGUpdvN6a8n3KDgIJ4jBQC4MPbcwCcVFpWUKTbvDf+NwkO5QgoAcHHsuYFP+ve2LHO86P7fqEvjyyxMAwDwJ+y5gU965pe9NmEhQRQbAIBLKDfwOUUlDuUUlD5mIeGyGhanAQD4G8oNfM7/fbXPHL/yx44WJgEA+CPKDXzO059uM8eNotlzAwBwDeUGPmXLvmPmeHTvphYmAQD4K8oNfMrvXv7SHI/sRbkBALiOcgOfkfjU2edHPdqnOU/+BgBUCuUGPuFYYbGOnzhtTo+4romFaQAA/oxyA5/Q/ukV5viHZ26SzcZjFgAAlUO5geUWbjx76XdifC2FBPPXEgBQeXyLwHKT/3X20u+PRnSzMAkAIBBQbmC5OjVCJUkP9GzC4SgAwCWj3MBSye+n68Dxk5Kk3i1iLE4DAAgElBtY5ouMw1qcdsCcbh4bYWEaAECgoNzAMsPe2GSONz5xvaKqh1iYBgAQKCg3sMTJYoc5vqtzQ8VEhFmYBgAQSCg3sMQTH209O76lpYVJAACBhnIDS2zY87Mkqaa9mmraq1mcBgAQSCg38Lqfjp3QodxTkqSPR3a3OA0AINBQbuB11/ztC3PcOLqGhUkAAIGIcgOvOu1wmuPOjeooKIib9gEA3MvycjNnzhwlJCQoLCxMXbp00caNGy+4/KxZs9S8eXOFh4crPj5ejzzyiE6dOuWltLhUa384Yo5fG9rJwiQAgEBlablZtGiRkpOTlZKSorS0NLVr1059+vTR4cOHy13+3Xff1dixY5WSkqLt27fr9ddf16JFizR+/HgvJ0dlFJU4dO+Cr83pGqGcSAwAcD9Ly83MmTM1fPhwDRs2TK1atdLcuXNVvXp1zZ8/v9zlv/zyS3Xv3l133323EhISdOONN+quu+664N6eoqIi5eXllXnBGuP+efby78f7tlAwh6QAAB5gWbkpLi7W5s2blZSUdDZMUJCSkpK0YcOGctfp1q2bNm/ebJaZPXv2aNmyZbr55pvP+znTpk1TVFSU+YqPj3fvLwKXRYWH6IGeja2OAQAIUJYdF8jJyZHD4VBsbGyZ+bGxsdqxY0e569x9993KycnRNddcI8MwVFJSogceeOCCh6XGjRun5ORkczovL4+CY4GDx09q8ZbS50gN7XoFT/8GAHiM5ScUu2L16tWaOnWqXn75ZaWlpWnx4sVaunSpnn766fOuY7fbFRkZWeYF7+s2/XNzfHWjOhYmAQAEOsv23ERHRys4OFjZ2dll5mdnZysuLq7cdSZOnKjBgwfrT3/6kySpTZs2Kiws1P33368nnnhCQUF+1dWqlMsj7DqSX6QWcRHqceXlVscBAAQwy9pAaGioOnbsqFWrVpnznE6nVq1apa5du5a7zokTJ84pMMHBwZIkwzA8FxaX5JP0AzqSXyRJmnlnorVhAAABz9JrcZOTkzV06FB16tRJnTt31qxZs1RYWKhhw4ZJkoYMGaL69etr2rRpkqR+/fpp5syZat++vbp06a
Jdu3Zp4sSJ6tevn1ly4FtOFJfo4YXp5nR8nXDrwgAAqgRLy83AgQN15MgRTZo0SVlZWUpMTFRqaqp5kvG+ffvK7Km
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Матрица ошибок:\n",
"[[1326 400]\n",
" [ 50 1760]]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApIAAAIjCAYAAACwHvu2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2r0lEQVR4nO3dd1gUV9sG8HtpS19ABcQogljAiiWKWCOR2BUTRY1iSYwGK4pKYm8k2BC7sWCNMRZiiUaiURQRFUWNIrEGG2BEBERpu98ffuzrigVWloPs/cs118WeOTPz7BLXx+ecOSNRKBQKEBEREREVkY7oAIiIiIjow8REkoiIiIjUwkSSiIiIiNTCRJKIiIiI1MJEkoiIiIjUwkSSiIiIiNTCRJKIiIiI1MJEkoiIiIjUwkSSiIiIiNTCRJKI3uratWto3749ZDIZJBIJwsLCivX8t2/fhkQiQWhoaLGe90PWpk0btGnTRnQYRETvxESS6ANw48YNfPPNN3B0dIShoSHMzc3h7u6OxYsX49mzZxq9to+PDy5duoQ5c+Zg06ZNaNy4sUavV5IGDhwIiUQCc3Pz136O165dg0QigUQiwfz584t8/vv372P69OmIjY0thmiJiEofPdEBENHb7d+/H1988QWkUikGDBiAOnXqIDs7GydOnIC/vz8uX76M1atXa+Taz549Q1RUFL7//nuMGDFCI9ewt7fHs2fPoK+vr5Hzv4uenh4yMzOxd+9e9OrVS2Xfli1bYGhoiOfPn6t17vv372PGjBmoWrUqGjRoUOjjDh06pNb1iIhKGhNJolLs1q1b8Pb2hr29PY4cOYKKFSsq9/n6+uL69evYv3+/xq7/8OFDAICFhYXGriGRSGBoaKix87+LVCqFu7s7fv755wKJ5NatW9GpUyfs3LmzRGLJzMyEsbExDAwMSuR6RETvi0PbRKVYUFAQMjIysHbtWpUkMp+TkxNGjx6tfJ2bm4tZs2ahWrVqkEqlqFq1Kr777jtkZWWpHFe1alV07twZJ06cwMcffwxDQ0M4Ojpi48aNyj7Tp0+Hvb09AMDf3x8SiQRVq1YF8GJIOP/nl02fPh0SiUSlLTw8HC1atICFhQVMTU1Rs2ZNfPfdd8r9b5ojeeTIEbRs2RImJiawsLBAt27dEBcX99rrXb9+HQMHDoSFhQVkMhkGDRqEzMzMN3+wr+jbty8OHDiA1NRUZduZM2dw7do19O3bt0D/lJQUjB8/HnXr1oWpqSnMzc3RoUMHXLhwQdnn6NGjaNKkCQBg0KBByiHy/PfZpk0b1KlTBzExMWjVqhWMjY2Vn8urcyR9fHxgaGhY4P17enrC0tIS9+/fL/R7JSIqTkwkiUqxvXv3wtHREc2bNy9U/6+++gpTp05Fw4YNsWjRIrRu3RqBgYHw9vYu0Pf69ev4/PPP8emnn2LBggWwtLTEwIEDcfnyZQCAl5cXFi1aBADo06cPNm3ahODg4CLFf/nyZXTu3BlZWVmYOXMmFixYgK5duyIyMvKtx/3555/w9PREcnIypk+fDj8/P5w8eRLu7u64fft2gf69evVCeno6AgMD0atXL4SGhmLGjBmFjtPLywsSiQS7du1Stm3duhW1atVCw4YNC/S/efMmwsLC0LlzZyxcuBD+/v64dOkSWrdurUzqnJ2dMXPmTADA0KFDsWnTJmzatAmtWrVSnufRo0fo0KEDGjRogODgYLRt2/a18S1evBgVKlSAj48P8vLyAACrVq3CoUOHsGTJEtjZ2RX6vRIRFSsFEZVKT548UQBQdOvWrVD9Y2NjFQAUX331lUr7+PHjFQAUR44cUbbZ29srACgiIiKUbcnJyQqpVKoYN26csu3WrVsKAIp58+apnNPHx0dhb29fIIZp06YpXv5aWbRokQKA4uHDh2+MO/8a69evV7Y1aNBAYW1trXj06JGy7cKFCwodHR3FgAEDClxv8ODBKufs0aOHoly5cm+85svvw8
TERKFQKBSff/65ol27dgqFQqHIy8tT2NraKmbMmPHaz+D58+eKvLy8Au9DKpUqZs6cqWw7c+ZMgfeWr3Xr1goAipUrV752X+vWrVXa/vjjDwUAxezZsxU3b95UmJqaKrp37/7O90hEpEmsSBKVUmlpaQAAMzOzQvX//fffAQB+fn4q7ePGjQOAAnMpXVxc0LJlS+XrChUqoGbNmrh586baMb8qf27lb7/9BrlcXqhjHjx4gNjYWAwcOBBWVlbK9nr16uHTTz9Vvs+XDRs2TOV1y5Yt8ejRI+VnWBh9+/bF0aNHkZiYiCNHjiAxMfG1w9rAi3mVOjovvj7z8vLw6NEj5bD9uXPnCn1NqVSKQYMGFapv+/bt8c0332DmzJnw8vKCoaEhVq1aVehrERFpAhNJolLK3NwcAJCenl6o/v/++y90dHTg5OSk0m5rawsLCwv8+++/Ku1VqlQpcA5LS0s8fvxYzYgL6t27N9zd3fHVV1/BxsYG3t7e2L59+1uTyvw4a9asWWCfs7Mz/vvvPzx9+lSl/dX3YmlpCQBFei8dO3aEmZkZfvnlF2zZsgVNmjQp8Fnmk8vlWLRoEapXrw6pVIry5cujQoUKuHjxIp48eVLoa1aqVKlIN9bMnz8fVlZWiI2NRUhICKytrQt9LBGRJjCRJCqlzM3NYWdnh7///rtIx716s8ub6OrqvrZdoVCofY38+Xv5jIyMEBERgT///BP9+/fHxYsX0bt3b3z66acF+r6P93kv+aRSKby8vLBhwwbs3r37jdVIAJg7dy78/PzQqlUrbN68GX/88QfCw8NRu3btQldegRefT1GcP38eycnJAIBLly4V6VgiIk1gIklUinXu3Bk3btxAVFTUO/va29tDLpfj2rVrKu1JSUlITU1V3oFdHCwtLVXucM73atUTAHR0dNCuXTssXLgQV65cwZw5c3DkyBH89ddfrz13fpzx8fEF9l29ehXly5eHiYnJ+72BN+jbty/Onz+P9PT0196glG/Hjh1o27Yt1q5dC29vb7Rv3x4eHh4FPpPCJvWF8fTpUwwaNAguLi4YOnQogoKCcObMmWI7PxGROphIEpViEyZMgImJCb766iskJSUV2H/jxg0sXrwYwIuhWQAF7qxeuHAhAKBTp07FFle1atXw5MkTXLx4Udn24MED7N69W6VfSkpKgWPzF+Z+dUmifBUrVkSDBg2wYcMGlcTs77//xqFDh5TvUxPatm2LWbNmYenSpbC1tX1jP11d3QLVzl9//RX37t1TactPeF+XdBfVxIkTkZCQgA0bNmDhwoWoWrUqfHx83vg5EhGVBC5ITlSKVatWDVu3bkXv3r3h7Oys8mSbkydP4tdff8XAgQMBAPXr14ePjw9Wr16N1NRUtG7dGqdPn8aGDRvQvXv3Ny4tow5vb29MnDgRPXr0wKhRo5CZmYkVK1agRo0aKjebzJw5ExEREejUqRPs7e2RnJyM5cuX46OPPkKLFi3eeP558+ahQ4cOcHNzw5AhQ/Ds2TMsWbIEMpkM06dPL7b38SodHR1Mnjz5nf06d+6MmTNnYtCgQWjevDkuXbqELVu2wNHRUaVftWrVYGFhgZUrV8LMzAwmJiZo2rQpHBwcihTXkSNHsHz5ckybNk25HNH69evRpk0bTJkyBUFBQUU6HxFRcWFFkqiU69q1Ky5evIjPP/8cv/32G3x9fTFp0iTcvn0bCxYsQEhIiLLvmjVrMGPGDJw5cwZjxozBkSNHEBAQgG3bthVrTOXKlcPu3bthbGyMCRMmYMOGDQgMDESXLl0KxF6lShWsW7cOvr6+WLZsGVq1aoUjR45AJpO98fweHh44ePAgypUrh6lTp2L+/Plo1qwZIiMji5yEacJ3332HcePG4Y8//sDo0aNx7tw57N+/H5UrV1bpp6+vjw0bNkBXVxfDhg1Dnz59cOzYsSJdKz09HYMHD4arqyu+//57ZXvLli0xevRoLFiwAKdOnSqW90VEVFQSRV
FmoxMRERER/T9WJImIiIhILUwkiYiIiEgtTCSJiIiISC1MJImIiIhILUwkiYiIiEgtTCSJiIiISC1MJImIiIhILWX
"text/plain": [
"<Figure size 800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Смещение: 0.8811650848575816\n",
"Дисперсия: 0.008658656436943876\n"
]
}
],
"source": [
"# Конвейер\n",
"pipeline_grad = Pipeline([\n",
" ('preprocessing', preprocessing),\n",
" ('classifier', GradientBoostingClassifier())\n",
"])\n",
"\n",
"# Определение сетки гиперпараметров\n",
"param_grid = {\n",
" 'classifier__n_estimators': [100, 200, 300],\n",
" #Скорость обучения\n",
" 'classifier__learning_rate': [0.1, 0.2],\n",
" #Максимальная глубина дерева\n",
" 'classifier__max_depth': [3, 5, 7],\n",
" 'classifier__subsample': [0.1, 0.5, 1.0],\n",
"}\n",
"\n",
"# Создание объекта GridSearchCV\n",
"grid_search = GridSearchCV(pipeline_grad, param_grid, cv=2, scoring='roc_auc', n_jobs=-1)\n",
"\n",
"# Обучение модели с перебором гиперпараметров\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Лучшая модель\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"# Использование и оценка лучшей модели\n",
"y_pred_proba = best_model.predict_proba(X_test)[:, 1]\n",
"print(f'ROC у метода градиентного спуска = {roc_auc_score(y_test, y_pred_proba)}')\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"print(f'Точность = {accuracy_score(y_test, y_pred)}')\n",
"\n",
"fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)\n",
"\n",
"# построение ROC кривой\n",
"plt.plot(fpr, tpr)\n",
"plt.ylabel('True Positive Rate')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.show()\n",
"\n",
"# Построение матрицы ошибок\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"\n",
"# Визуализация матрицы ошибок\n",
"plt.figure(figsize=(8, 6))\n",
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', \n",
" xticklabels=['Предсказанный \"безопасный\"', 'Предсказанный \"опасный\"'], \n",
" yticklabels=['Действительно \"безопасный\"', 'Действительно \"опасный\"'])\n",
"plt.title('Confusion Matrix')\n",
"plt.ylabel('Actual')\n",
"plt.xlabel('Predicted')\n",
"plt.show()\n",
"\n",
"# Оценка дисперсии и смещения\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
"print(f\"Дисперсия: {std_test_score.mean()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Вывод**:\n",
"\n",
"Все модели классификации показали хорошие результаты, но лучший показатель точности у случайного леса. При этом все рассмотренные модели немного не дотянули до показателя точности в 90%. Дополнительая настройка гиперпараметров могла бы приблизить значение оценки ещё ближе к 90% "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}