808 lines
215 KiB
Plaintext
Raw Normal View History

2024-11-15 21:25:43 +04:00
{
"cells": [
{
"cell_type": "code",
2024-11-16 09:23:33 +04:00
"execution_count": 2,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.metrics import mean_squared_error\n",
"import matplotlib.pyplot as plt\n",
"from scipy import stats\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"from sklearn import linear_model, tree, neighbors, ensemble, neural_network\n",
"\n",
"df = pd.read_csv(\"..//static//csv//balanced_neo.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **1-я бизнес-цель (регрессия)**: \n",
"\n",
"Предсказание скорости космического объекта для принятия решения о том, насколько опасным он может быть и стоит ли вести за ним наблюдения"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Целевой признак: скорость космического объекта relative_velocity\n",
"\n",
"Вход: минимальный радиус est_diameter_min, максимальный радиус est_diameter_max, яркость объекта absolute_magnitude, расстояние от Земли miss_distance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Достижимый уровень качества: предсказания должны иметь погрешность в среднем не более 10000 км/с. Для проверки будет использоваться метрика MAE (средняя абсолютная ошибка)"
]
},
{
"cell_type": "code",
2024-11-16 09:23:33 +04:00
"execution_count": 4,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.discriminant_analysis import StandardScaler\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"import seaborn as sns\n",
"from sklearn.model_selection import cross_val_predict\n",
"from sklearn.metrics import mean_squared_error\n",
"import numpy as np\n",
"from sklearn import metrics\n",
"import sklearn.preprocessing as preproc\n",
"from sklearn.linear_model import LinearRegression, Ridge\n",
"from sklearn.metrics import mean_absolute_error\n",
"from mlxtend.evaluate import bias_variance_decomp\n",
"from sklearn.neural_network import MLPRegressor\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//balanced_neo.csv\")\n",
"data = df[['est_diameter_min', 'est_diameter_max', 'absolute_magnitude', 'miss_distance', 'relative_velocity']]\n",
"\n",
"X = data.drop('relative_velocity', axis=1)\n",
"y = data['relative_velocity']\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Преобразование числовых данных\n",
"#заполнение пустых значений медианой\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"\n",
"preprocessing_num = Pipeline(\n",
" [\n",
" (\"imputer\", num_imputer)\n",
" ]\n",
")\n",
"\n",
"#Категориальных данных нет, поэтому преобразовывать их не надо\n",
"\n",
"\n",
"# Общая предобработка (только числовые данные)\n",
"preprocessing = ColumnTransformer(\n",
" [\n",
" (\"nums\", preprocessing_num, X.columns)\n",
" ]\n",
")\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Линейная регрессия"
]
},
{
"cell_type": "code",
2024-11-16 09:40:09 +04:00
"execution_count": 8,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'preprocessing': MinMaxScaler()}\n",
"Cредняя абсолютная ошибка (MAE) = 19241.554618019443\n",
2024-11-16 09:23:33 +04:00
"Смещение: -24344.57878426918\n",
2024-11-16 09:40:09 +04:00
"Дисперсия: 219.3206565410472\n",
"R^2 = 0.18832948575910047\n"
2024-11-15 21:25:43 +04:00
]
}
],
"source": [
"pipeline_lin_reg = Pipeline([\n",
" ('preprocessing', preprocessing),\n",
" ('model', LinearRegression())]\n",
")\n",
"\n",
"# Определение сетки гиперпараметров (возможных знач-ий гиперпараметров) для перебора\n",
"param_grid = {\n",
" #как будут масштабироваться признаки\n",
" 'preprocessing': [StandardScaler(), preproc.MinMaxScaler(), preproc.MaxAbsScaler(), None]\n",
"}\n",
"\n",
"# Создание объекта GridSearchCV для поиска лучших гиперпараметров по сетке с максимальным знач-ием \n",
"# отрицательного корня из среднеквадратичной ошибки (отриц., чтобы искался не минимум, а максимум)\n",
"grid_search = GridSearchCV(pipeline_lin_reg, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)\n",
"\n",
"# Обучение модели с перебором гиперпараметров\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Лучшая модель лин. регрессии\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"print(f'Cредняя абсолютная ошибка (MAE) = {mean_absolute_error(y_test, y_pred)}')\n",
"\n",
"\n",
2024-11-16 09:23:33 +04:00
"# mse, bias, variance = bias_variance_decomp(best_model, X_train.values, y_train.values, X_test.values, y_test.values, loss='mse', num_rounds=200, random_seed=123)\n",
"# print(\"Смещение: \", bias)\n",
"# print(\"Дисперсия: \", variance)\n",
"\n",
"# Оценка дисперсии и смещения\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
2024-11-16 09:40:09 +04:00
"print(f\"Дисперсия: {std_test_score.mean()}\")\n",
"\n",
"from sklearn.metrics import r2_score\n",
"\n",
"print(f'R^2 = {r2_score(y_test, y_pred)}')"
2024-11-15 21:25:43 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Гребневая регрессия"
]
},
{
"cell_type": "code",
2024-11-16 09:40:09 +04:00
"execution_count": 9,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'model__alpha': 10.0, 'preprocessing': MinMaxScaler()}\n",
"Cредняя абсолютная ошибка (MAE) = 19239.098934204343\n",
2024-11-16 09:23:33 +04:00
"Смещение: -24500.751070720406\n",
2024-11-16 09:40:09 +04:00
"Дисперсия: 399.3445953588631\n",
"R^2 = 0.18843191913477164\n"
2024-11-15 21:25:43 +04:00
]
}
],
"source": [
"pipeline_ridge = Pipeline([\n",
" ('preprocessing', preprocessing),\n",
" ('model', Ridge())]\n",
")\n",
"\n",
"# Определение сетки гиперпараметров (возможных знач-ий гиперпараметров) для перебора\n",
"param_grid = {\n",
" #как будут масштабироваться признаки\n",
" 'preprocessing': [StandardScaler(), preproc.MinMaxScaler(), preproc.MaxAbsScaler(), None],\n",
" #сила регуляризации\n",
" 'model__alpha': [0, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0] \n",
"}\n",
"\n",
"# Создание объекта GridSearchCV для поиска лучших гиперпараметров по сетке с максимальным знач-ием \n",
"# отрицательного корня из среднеквадратичной ошибки (отриц., чтобы искался не минимум, а максимум)\n",
"grid_search = GridSearchCV(pipeline_ridge, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=0)\n",
"\n",
"# Обучение модели с перебором гиперпараметров\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Лучшая модель регрессии\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"print(f'Cредняя абсолютная ошибка (MAE) = {mean_absolute_error(y_test, y_pred)}')\n",
"\n",
2024-11-16 09:23:33 +04:00
"# mse, bias, variance = bias_variance_decomp(best_model, X_train.values, y_train.values, X_test.values, y_test.values, loss='mse', num_rounds=200, random_seed=123)\n",
"# print(\"Смещение: \", bias)\n",
"# print(\"Дисперсия: \", variance)\n",
"\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
2024-11-16 09:40:09 +04:00
"print(f\"Дисперсия: {std_test_score.mean()}\")\n",
"\n",
"print(f'R^2 = {r2_score(y_test, y_pred)}')"
2024-11-15 21:25:43 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Гребневая регрессия показала почти такие же результаты, что и линейная регрессия"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Метод градиентного бустинга (набор деревьев решений)"
]
},
{
"cell_type": "code",
2024-11-16 09:40:09 +04:00
"execution_count": 10,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-16 09:40:09 +04:00
"Лучшие гиперпараметры: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'preprocessing': None}\n",
"Cредняя абсолютная ошибка (MAE) = 18906.41250012098\n",
"Смещение: -24465.30968285963\n",
"Дисперсия: 186.25822491864383\n",
"R^2 = 0.21038509874388833\n"
2024-11-15 21:25:43 +04:00
]
}
],
"source": [
"# Конвейер\n",
"pipeline_grad = Pipeline([\n",
" ('preprocessing', preprocessing),\n",
" ('model', GradientBoostingRegressor())\n",
"])\n",
"\n",
"# Определение сетки гиперпараметров\n",
"param_grid = {\n",
" 'preprocessing': [StandardScaler(), preproc.MinMaxScaler(), preproc.MaxAbsScaler(), None],\n",
" 'model__n_estimators': [100, 200, 300],\n",
" #Скорость обучения\n",
" 'model__learning_rate': [0.1, 0.2],\n",
" #Максимальная глубина дерева\n",
" 'model__max_depth': [3, 5, 7]\n",
"}\n",
"\n",
"# Создание объекта GridSearchCV\n",
"grid_search = GridSearchCV(pipeline_grad, param_grid, cv=2, scoring='neg_root_mean_squared_error', n_jobs=-1)\n",
"\n",
"# Обучение модели с перебором гиперпараметров\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Лучшая модель случайного леса\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"\n",
"\n",
"print(f'Cредняя абсолютная ошибка (MAE) = {mean_absolute_error(y_test, y_pred)}')\n",
"\n",
"\n",
"# Получение предсказаний на кросс-валидации\n",
"y_cv_pred = cross_val_predict(best_model, X_train, y_train, cv=3)\n",
"\n",
2024-11-16 09:23:33 +04:00
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
2024-11-15 21:25:43 +04:00
"\n",
2024-11-16 09:23:33 +04:00
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
2024-11-16 09:40:09 +04:00
"print(f\"Дисперсия: {std_test_score.mean()}\")\n",
"\n",
"print(f'R^2 = {r2_score(y_test, y_pred)}')"
2024-11-15 21:25:43 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Вывод**: \n",
"\n",
2024-11-16 09:40:09 +04:00
"Все 3 модели регрессии не показали необходимого уровня \"погрешности\". Также у всех моделей большое значение смещения, т.е. эти модели для задачи слишком простые. Необходимо использовать более сложные модели. Также возможно, что по доступным в датасете данным нельзя достичь необходимой погрешности.\n",
"\n",
"Также низкое значение метрики R^2 у всех моделей показывает, что эти модели плохо объясняют вариацию целевой переменной.\n",
2024-11-15 21:25:43 +04:00
"\n",
"Из всех моделей градиентный бустинг показал самую низкую \"погрешность\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **2-я бизнес-цель (классификация):** \n",
"\n",
"Определение опасности космиеского объекта для увеличения безопасности Земли"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Целевой признак: опасность объекта hazardous\n",
"\n",
"Вход: минимальный радиус est_diameter_min, максимальный радиус est_diameter_max, яркость объекта absolute_magnitude, скорость relative_velocity"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Достижимый уровень качества: необходимо, чтобы точность предсказания модели составляла не менее 90%. Для проверки этого будет использована метрика Accuracy"
]
},
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 6,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.discriminant_analysis import StandardScaler\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"import seaborn as sns\n",
"from sklearn.model_selection import cross_val_predict\n",
"from sklearn.metrics import mean_squared_error\n",
"import numpy as np\n",
"from sklearn import metrics\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//balanced_neo.csv\")\n",
"data = df[['est_diameter_min', 'est_diameter_max', 'absolute_magnitude', 'relative_velocity', 'hazardous']]\n",
"\n",
"X = data.drop('hazardous', axis=1)\n",
"y = data['hazardous']\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Преобразование числовых данных\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
" [\n",
" (\"imputer\", num_imputer),\n",
" (\"scaler\", num_scaler),\n",
" ]\n",
")\n",
"\n",
"# Общая предобработка (только числовые данные)\n",
"preprocessing = ColumnTransformer(\n",
" [\n",
" (\"nums\", preprocessing_num, X.columns),\n",
" ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Логистическая регрессия"
]
},
{
"cell_type": "code",
2024-11-15 21:26:00 +04:00
"execution_count": 7,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}\n",
2024-11-15 21:26:00 +04:00
"ROC у логистической регрессии = 0.8670873798838691\n",
2024-11-15 21:25:43 +04:00
"Точность = 0.8591628959276018\n"
]
},
{
"data": {
2024-11-15 21:26:00 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA6aElEQVR4nO3df5zNdf7///uc4cwvZrCzZgZTQz+UJUJ8RspbpqZfYttKsUwqbcVkzaoQhpKxFWlRlpLqXUtalUW8M8UiuwrTL2IxIsxoVs0wmOGc5/cPX6eZMaM54/x8ndv1cjmXi/Oa1+ucx3n5ce6er8fr+QwzxhgBAABYhM3fBQAAAHgS4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFhKPX8X4GtOp1MHDhxQw4YNFRYW5u9yAABALRhjdOTIETVr1kw227nHZkIu3Bw4cEDJycn+LgMAANTBvn371KJFi3PuE3LhpmHDhpJOn5zY2Fg/VwMAAGqjpKREycnJru/xcwm5cHPmUlRsbCzhBgCAIFOblhIaigEAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKX4Ndz885//VO/evdWsWTOFhYXp/fff/8VjVq9erY4dOyoiIkIXX3yx5s+f7/U6AQBA8PBruCktLVX79u01a9asWu2fn5+vW265RT179lReXp7++Mc/6oEHHtDKlSu9XCkAAAgWfl0486abbtJNN91U6/1nz56tli1baurUqZKkyy+/XOvWrdMLL7yg9PR0b5UJAIDbjDE6ftLh7zL8Jqp+eK0WufSGoFoVfMOGDUpLS6u0LT09XX/84x9rPKasrExlZWWu5yUlJd4qDwhoof4PLeBLxkh3zt6grQdD9ztn61Ppirb7J2YEVbgpKChQQkJCpW0JCQkqKSnR8ePHFRUVddYxOTk5mjhxoq9KRAgJprDAP7QAQklQhZu6GD16tLKyslzPS0pKlJyc7MeKEKwqhhnCAoDaaJMUq0UPpcpPV2f8Kqp+uN/eO6jCTWJiogoLCyttKywsVGxsbLWjNpIUERGhiIgIX5QHC6k6KmOVMBPK/9AC/uDPvpNQFlThJjU1VcuXL6+07aOPPlJqaqqfKoLVGGN0rNxR6yATbGGBf2gBhAK/hpujR49q586druf5+fnKy8tTkyZNdMEFF2j06NHav3+/3njjDUnSQw89pJkzZ+rxxx/Xfffdp48//ljvvPOOli1b5q+PAAtxOo1unbHunKGmapghLABA4PFruPn888/Vs2dP1/MzvTEZGRmaP3++Dh48qL1797p+3rJlSy1btkwjRozQiy++qBYtWuiVV17hNnCcN6fTqNe0NcovKnVtq25UhjADAIEvzBhj/F2EL5WUlCguLk7FxcWKjY31dznwszOXoW6dsc4VbFrGx2hpZndF2wkyABAo3Pn+DqqeG8BTauqtaRkfo9ysHrLZCDUAEKwINwg5NfXWtEmK1dLM7gQbAAhyhBuEFGPODjZnemu4DAUA1kC4QUg5Vu5wBRt6awDAmgg3CBnGGN05e4Pr+dLM7oqJ4K8AAFiNzd8FAL5ScdSmTVKsou3+mxocAOA9hBuEhKqjNqfnr+FSFABYEeEGIeH4SUZtACBUEG4Qchi1AQBrI9wg5JBrAMDaCDcAAMBSCDcAAMBSCDcICaG1PCwAhDbCDSyv6m3gAABrI9zA8qpO3hdVn9vAAcDKCDewtDMrgJ/BbeAAYH2EG1iW02nUa9oa5ReVSmLyPgAIFYQbWJIxp0dszgSbMyuAM2oDANZHuIElVVxuoWV8jHKzeshmI9gAQCgg3MCSKt76vTSzO8EGAEII4QaWU/XWb65EAUBoIdzAcqquAM6t3wAQWgg3sDRu/QaA0EO4gaWRawAg9BBuYDmsIwUAoY1wA0upOiMxACD0EG5gGVUn7qOZGABCE+
EGllF14j5mJAaA0ES4gSUxcR8AhC7CDSyjYiMxAzYAELoIN7CEqrMSAwBCF+EGlsCsxACAMwg3sBxmJQaA0Ea4gSXQbwMAOINwg6BHvw0AoCLCDYIe/TYAgIoIN7AU+m0AAIQbBD36bQAAFRFuENTotwEAVEW4QVCj3wYAUBXhBpZBvw0AQCLcwELINQAAiXCDIFexmRgAAIlwgyDmdBrdOmOdv8sAAAQYwg2CktNp1GvaGuUXlUqimRgA8DPCDYKOMadHbM4Em5bxMVqa2Z1mYgCAJMINgtCx8p9v/24ZH6PcrB6y2Qg2AIDTCDcIKlUn7Vua2Z1gAwCohHCDoFJ10r5oO302AIDKCDcIWkzaBwCoDuEGQYVFMgEAv4Rwg6DBIpkAgNog3CBosEgmAKA2CDcISvTbAABqQrhBUCLXAABqQrgBAACWQrgBAACW4vdwM2vWLKWkpCgyMlJdu3bVxo0bz7n/9OnT1bp1a0VFRSk5OVkjRozQiRMnfFQtAAAIdH4NNwsXLlRWVpays7O1efNmtW/fXunp6Tp06FC1+7/99tsaNWqUsrOztW3bNr366qtauHChxowZ4+PK4Q8V57gBAKAmfg0306ZN05AhQzR48GC1adNGs2fPVnR0tObNm1ft/p9++qmuvvpq9e/fXykpKbrhhht0zz33nHO0p6ysTCUlJZUeCD7McQMAqC2/hZvy8nJt2rRJaWlpPxdjsyktLU0bNlT/JdatWzdt2rTJFWZ2796t5cuX6+abb67xfXJychQXF+d6JCcne/aDwCeY4wYAUFv1/PXGRUVFcjgcSkhIqLQ9ISFB3377bbXH9O/fX0VFRerevbuMMTp16pQeeuihc16WGj16tLKyslzPS0pKCDhBjjluAADn4veGYnesXr1akydP1ksvvaTNmzdr8eLFWrZsmZ5++ukaj4mIiFBsbGylB4IPa0oBAGrLbyM38fHxCg8PV2FhYaXthYWFSkxMrPaYcePGaeDAgXrggQckSe3atVNpaakefPBBPfnkk7LZgiqroZbotwEAuMNvacBut6tTp07Kzc11bXM6ncrNzVVqamq1xxw7duysABMefrr3wnArjWXRbwMAcIffRm4kKSsrSxkZGercubO6dOmi6dOnq7S0VIMHD5YkDRo0SM2bN1dOTo4kqXfv3po2bZquvPJKde3aVTt37tS4cePUu3dvV8iBtdFvAwD4JX4NN/369dMPP/yg8ePHq6CgQB06dNCKFStcTcZ79+6tNFIzduxYhYWFaezYsdq/f79+/etfq3fv3nrmmWf89RHgA/TbAADcEWZC7HpOSUmJ4uLiVFxcTHNxEDDG6Ja/rHNdltr6VLqi7X7N5AAAP3Dn+5sOXAQ0+m0AAO4i3CBo0G8DAKgNwg0CGv02AAB3EW4QsJjfBgBQF4QbBKxj5fTbAADcR7hBQHI6jW6dsc71nH4bAEBtEW4QcIw5HWzyi0olnR61ibYzagMAqB3CDQJOxdu/W8bHaGlmd0ZtAAC1RrhBQFua2V02G8EGAFB7hBsEHG7/BgCcD8INAgq3fwMAzhfhBgGF5RYAAOeLcIOAxe3fAIC6INwgoNBvAwA4X4QbBAz6bQAAnkC4QcBguQUAgCcQbhAQqo7a0G8DAKgrwg0CQtVRG5ZbAADUFeEGfseoDQDAkwg38Luqc9swagMAOB+EGwQURm0AAOeLcIOAQq4BAJwvwg0AALAUwg38ruKsxAAAnC/CDfyKWYkBAJ5GuIFfsQo4AMDTCDcIGNwpBQDwBMINAga5BgDgCYQbAABgKYQbAABgKYQbAABgKYQb+BVz3AAAPI1wA79hjhsAgDecV7g5ceKEp+pACDpWzhw3AADPczvcOJ1OPf3002revLkaNGig3bt3S5LGjRunV1991eMFwpqqjtowxw0AwFPcDjeTJk3S/Pnz9eyzz8put7u2t23bVq+88opHi4N1VZ2ZONrOqA0AwDPcDjdvvPGG5syZowEDBi
g8/OcvpPbt2+vbb7/1aHEIDYzaAAA8ye1ws3//fl188cVnbXc6nTp58qRHikJoIdcAADzJ7XDTpk0brV279qzt777
2024-11-15 21:25:43 +04:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApIAAAIjCAYAAACwHvu2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2x0lEQVR4nO3deVxU5fv/8dcgsggCoiJiLrjjvpXhblLmvn7KMsWlzTRzXyrNNQpzSS3NTFHTzNxSK9M0JY3ctxRNUyMX1EREXJBlfn/4Y75OqMHIcBDez8/jPD7Mfe65z3UGHa+uc5/7mMxmsxkRERERkQxyMDoAEREREXk0KZEUEREREZsokRQRERERmyiRFBERERGbKJEUEREREZsokRQRERERmyiRFBERERGbKJEUEREREZsokRQRERERmyiRFJEHOn78OM888wyenp6YTCZWr16dqeOfPn0ak8lEWFhYpo77KGvSpAlNmjQxOgwRkf+kRFLkEfDnn3/y2muvUbp0aVxcXPDw8KB+/fp8/PHH3Lx5067HDg4O5tChQ0ycOJFFixZRp04dux4vK/Xo0QOTyYSHh8c9P8fjx49jMpkwmUx89NFHGR7/3LlzjBkzhv3792dCtCIi2Y+j0QGIyIN99913/O9//8PZ2Znu3btTpUoVbt++zbZt2xg6dCiHDx9mzpw5djn2zZs3iYiI4J133qFfv352OUbJkiW5efMmefPmtcv4/8XR0ZEbN26wdu1annvuOat9ixcvxsXFhVu3btk09rlz5xg7diylSpWiRo0a6X7fhg0bbDqeiEhWUyIpko2dOnWKLl26ULJkSTZv3kzRokUt+/r27cuJEyf47rvv7Hb8S5cuAeDl5WW3Y5hMJlxcXOw2/n9xdnamfv36fPXVV2kSySVLltCqVStWrFiRJbHcuHGDfPny4eTklCXHExF5WLq0LZKNhYaGEh8fzxdffGGVRKYqW7Ysb731luV1UlIS48ePp0yZMjg7O1OqVCnefvttEhISrN5XqlQpWrduzbZt23jiiSdwcXGhdOnSLFy40NJnzJgxlCxZEoChQ4diMpkoVaoUcOeScOrPdxszZgwmk8mqbePGjTRo0AAvLy/c3d2pUKECb7/9tmX//eZIbt68mYYNG+Lm5oaXlxft2rUjMjLynsc7ceIEPXr0wMvLC09PT3r27MmNGzfu/8H+y4svvsgPP/xAbGyspW3Xrl0cP36cF198MU3/mJgYhgwZQtWqVXF3d8fDw4MWLVpw4MABS58tW7bw+OOPA9CzZ0/LJfLU82zSpAlVqlRhz549NGrUiHz58lk+l3/PkQwODsbFxSXN+Tdv3pwCBQpw7ty5dJ+riEhmUiIpko2tXbuW0qVLU69evXT1f/nllxk9ejS1atVi6tSpNG7cmJCQELp06ZKm74kTJ+jcuTNPP/00kydPpkCBAvTo0YPDhw8D0LFjR6ZOnQrACy+8wKJFi5g2bVqG4j98+DCtW7cmISGBcePGMXnyZNq2bcv27dsf+L6ffvqJ5s2bc/HiRcaMGcOgQYP49ddfqV+/PqdPn07T/7nnnuPatWuEhITw3HPPERYWxtixY9MdZ8eOHTGZTKxcudLStmTJEipWrEitWrXS9D958iSrV6+mdevWTJkyhaFDh3Lo0CEaN25sSeoCAgIYN24cAK+++iqLFi1i0aJFNGrUyDLO5cuXadGiBTVq1GDatGk0bdr0nvF9/PHHFC5cmODgYJKTkwH47LPP2LBhAzNmzMDPzy/d5yoikqnMIpItXb161QyY27Vrl67++/fvNwPml19+2ap9yJAhZsC8efNmS1vJkiXNgDk8PNzSdvHiRbOzs7N58ODBlrZTp06ZAfOkSZOsxgwODjaXLFkyTQzvvfee+e6vlalTp5oB86VLl+4bd+ox5s+fb2mrUaOG2cfHx3z58mVL24EDB8wODg7m7t27pzler169rMbs0KGDuWDBgvc95t3n4e
bmZjabzebOnTubmzVrZjabzebk5GSzr6+veezYsff8DG7dumVOTk5Ocx7Ozs7mcePGWdp27dqV5txSNW7c2AyYZ8+efc99jRs3tmr78ccfzYB5woQJ5pMnT5rd3d3N7du3/89zFBGxJ1UkRbKpuLg4APLnz5+u/t9//z0AgwYNsmofPHgwQJq5lJUqVaJhw4aW14ULF6ZChQqcPHnS5pj/LXVu5bfffktKSkq63nP+/Hn2799Pjx498Pb2trRXq1aNp59+2nKed3v99detXjds2JDLly9bPsP0ePHFF9myZQvR0dFs3ryZ6Ojoe17WhjvzKh0c7nx9Jicnc/nyZctl+71796b7mM7OzvTs2TNdfZ955hlee+01xo0bR8eOHXFxceGzzz5L97FEROxBiaRINuXh4QHAtWvX0tX/r7/+wsHBgbJly1q1+/r64uXlxV9//WXVXqJEiTRjFChQgCtXrtgYcVrPP/889evX5+WXX6ZIkSJ06dKFZcuWPTCpTI2zQoUKafYFBATwzz//cP36dav2f59LgQIFADJ0Li1btiR//vx8/fXXLF68mMcffzzNZ5kqJSWFqVOnUq5cOZydnSlUqBCFCxfm4MGDXL16Nd3HLFasWIZurPnoo4/w9vZm//79TJ8+HR8fn3S/V0TEHpRIimRTHh4e+Pn58fvvv2foff++2eV+8uTJc892s9ls8zFS5++lcnV1JTw8nJ9++olu3bpx8OBBnn/+eZ5++uk0fR/Gw5xLKmdnZzp27MiCBQtYtWrVfauRAO+//z6DBg2iUaNGfPnll/z4449s3LiRypUrp7vyCnc+n4zYt28fFy9eBODQoUMZeq+IiD0okRTJxlq3bs2ff/5JRETEf/YtWbIkKSkpHD9+3Kr9woULxMbGWu7AzgwFChSwusM51b+rngAODg40a9aMKVOmcOTIESZOnMjmzZv5+eef7zl2apzHjh1Ls+/o0aMUKlQINze3hzuB+3jxxRfZt28f165du+cNSqmWL19O06ZN+eKLL+jSpQvPPPMMQUFBaT6T9Cb16XH9+nV69uxJpUqVePXVVwkNDWXXrl2ZNr6IiC2USIpkY8OGDcPNzY2XX36ZCxcupNn/559/8vHHHwN3Ls0Cae6snjJlCgCtWrXKtLjKlCnD1atXOXjwoKXt/PnzrFq1yqpfTExMmvemLsz97yWJUhUtWpQaNWqwYMECq8Ts999/Z8OGDZbztIemTZsyfvx4Zs6cia+v73375cmTJ02185tvvuHs2bNWbakJ772S7owaPnw4UVFRLFiwgClTplCqVCmCg4Pv+zmKiGQFLUguko2VKVOGJUuW8PzzzxMQEGD1ZJtff/2Vb775hh49egBQvXp1goODmTNnDrGxsTRu3JidO3eyYMEC2rdvf9+lZWzRpUsXhg8fTocOHejfvz83btxg1qxZlC9f3upmk3HjxhEeHk6rVq0oWbIkFy9e5NNPP+Wxxx6jQYMG9x1/0qRJtGjRgsDAQHr37s3NmzeZMWMGnp6ejBkzJtPO498cHBx49913/7Nf69atGTduHD179qRevXocOnSIxYsXU7p0aat+ZcqUwcvLi9mzZ5M/f37c3NyoW7cu/v7+GYpr8+bNfPrpp7z33nuW5Yjmz59PkyZNGDVqFKGhoRkaT0Qks6giKZLNtW3bloMHD9K5c2e+/fZb+vbty4gRIzh9+jSTJ09m+vTplr5z585l7Nix7Nq1iwEDBrB582ZGjhzJ0qVLMzWmggULsmrVKvLly8ewYcNYsGABISEhtGnTJk3sJUqUYN68efTt25dPPvmERo0asXnzZjw9Pe87flBQEOvXr6dgwYKMHj2ajz76iCeffJLt27dnOAmzh7fffpvBgwfz448/8tZbb7F3716+++47ihcvbtUvb968LFiwgDx58vD666/zwgsvsHXr1gwd69q1a/Tq1YuaNWvyzjvvWNobNmzIW2+9xeTJk/ntt98y5bxERDLKZM7IbH
QRERERkf9PFUkRERERsYkSSRERERGxiRJJEREREbGJEkkRERERsYkSSRERERGxiRJJEREREbGJEkkRERERsUmOfLJ
"text/plain": [
"<Figure size 800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-15 21:26:00 +04:00
"Смещение: 0.8529941124698746\n",
"Дисперсия: 0.0065558753718589465\n"
2024-11-15 21:25:43 +04:00
]
}
],
"source": [
"# Конвейер для логистической регрессии\n",
"pipeline_logreg = Pipeline([\n",
" ('preprocessing', preprocessing),\n",
" ('classifier', LogisticRegression())\n",
"])\n",
"\n",
"# Определение сетки гиперпараметров (возможных знач-ий гиперпараметров) для перебора\n",
"param_grid = {\n",
" # Параметр регуляризации (сила регуляризации), чем меньше, тем сильнее регуляризация\n",
" 'classifier__C': [0.1, 0.5, 1],\n",
" # Тип регуляризации (ф-ия штрафов)\n",
" 'classifier__penalty': ['l1', 'l2'],\n",
" # Решатель (сам алгоритм?)\n",
" 'classifier__solver': ['liblinear', 'saga']\n",
"}\n",
"\n",
"# Создание объекта GridSearchCV для поиска лучших гиперпараметров по сетке с максимальным знач-ием ROC-кривой\n",
"grid_search = GridSearchCV(pipeline_logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)\n",
"\n",
"# Обучение модели с перебором гиперпараметров\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Лучшая модель логистической регрессии\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"# Использование и оценка лучшей логистической модели\n",
"y_pred_proba = best_model.predict_proba(X_test)[:, 1]\n",
"print(f'ROC у логистической регрессии = {roc_auc_score(y_test, y_pred_proba)}')\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"print(f'Точность = {accuracy_score(y_test, y_pred)}')\n",
"\n",
"fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)\n",
"\n",
"# построение ROC кривой\n",
"plt.plot(fpr, tpr)\n",
"plt.ylabel('True Positive Rate')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.show()\n",
"\n",
"# Построение матрицы ошибок\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"\n",
"# Визуализация матрицы ошибок\n",
"plt.figure(figsize=(8, 6))\n",
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', \n",
" xticklabels=['Предсказанный \"безопасный\"', 'Предсказанный \"опасный\"'], \n",
" yticklabels=['Действительно \"безопасный\"', 'Действительно \"опасный\"'])\n",
"plt.title('Confusion Matrix')\n",
"plt.ylabel('Actual')\n",
"plt.xlabel('Predicted')\n",
"plt.show()\n",
"\n",
"# Оценка дисперсии и смещения\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
"print(f\"Дисперсия: {std_test_score.mean()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Метод случаного леса (набор деревьев решений)"
]
},
{
"cell_type": "code",
2024-11-16 09:40:09 +04:00
"execution_count": null,
2024-11-15 21:25:43 +04:00
"metadata": {},
"outputs": [
2024-11-15 21:26:00 +04:00
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\AI labs\\aimenv\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
" _data = np.array(data, dtype=dtype, copy=copy,\n"
]
},
2024-11-15 21:25:43 +04:00
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-15 21:26:00 +04:00
"Лучшие гиперпараметры: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__n_estimators': 200}\n",
"ROC у метода случайного леса = 0.9009594886141752\n",
"Точность = 0.8721719457013575\n"
2024-11-15 21:25:43 +04:00
]
},
{
"data": {
2024-11-15 21:26:00 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABDcklEQVR4nO3de5zM9eLH8ffsbXaxu2jbXdZq3S+RaxwkB1tUR9Q5pfghlc7JJYduCEsJp4t0SjmUpFORrk7EQQjpKGwREVauu6zLrt1ld818f3/IV5vFzpqZ78zs6/l4zKPP97vf78x7vjbz9p3vxWYYhiEAAIAAEWR1AAAAAHei3AAAgIBCuQEAAAGFcgMAAAIK5QYAAAQUyg0AAAgolBsAABBQQqwO4G1Op1MHDx5UZGSkbDab1XEAAEAJGIahkydPqmrVqgoKuvS+mTJXbg4ePKjExESrYwAAgFLYt2+fqlWrdsllyly5iYyMlHR240RFRVmcBgAAlER2drYSExPNz/FLKXPl5txXUVFRUZQbAAD8TEkOKeGAYgAAEFAoNwAAIKBQbgAAQECh3AAAgIBCuQEAAAGFcgMAAAIK5QYAAAQUyg0AAAgolBsAABBQKDcAACCgWFpuvvrqK3Xr1k1Vq1aVzWbTp59+etl1Vq5cqebNm8tut6t27dqaPXu2x3MCAAD/YWm5yc3NVZMmTTRt2rQSLZ+WlqbbbrtNHTt2VGpqqv7+97/rwQcf1JIlSzycFAAA+AtLb5x5yy236JZbbinx8tOnT1eNGjX04osvSpIaNGigNWvW6KWXXlKXLl08FRNwmWEYOpFXqNyCM1ZHAQCvCwsJUmxkuGWv71d3BV+3bp2Sk5OLzOvSpYv+/ve/X3Sd/Px85efnm9PZ2dmeiocAc7rQoR0ZJ7Vp7wl998txnTxdqF1HcrTv2ClFhV/6f53ThU4VOJxeSgoAvqV59Yr6eGA7y17fr8pNenq64uLiisyLi4tTdna2Tp06pYiIiAvWmTRpksaPH++tiAgAb6zerTfXpOlQ1umLLpN9umR7ZMJCgmRzVzAA8BOhwdaer+RX5aY0Ro4cqeHDh5vT2dnZSkxMtDARfNkvR3M1YeG2IvPKhwUrt8ChNjWvUp24Cqp1dQU1rBqlq8qHXfR5wkKCdHWkXfaQYE9HBgD8jl+Vm/j4eGVkZBSZl5GRoaioqGL32kiS3W6X3W73Rjz4McMwVGPkoiLzHu9ST/e2qq7KlygxAADf41fXuWnTpo2WL19eZN7SpUvVpk0bixIhEKzbdfSCYtO6RmUN6libYgMAfsjSPTc5OTnauXOnOZ2WlqbU1FRVrlxZ1atX18iRI3XgwAHNmTNHkvS3v/1Nr776qp544gndf//9+vLLL/XBBx9o4cKFVr0F+LkpS3fon8t/LjJvz+TbLEoDAHAHS8vNd999p44dO5rT546N6devn2bPnq1Dhw5p79695s9r1KihhQsXatiwYXr55ZdVrVo1vfHGG5wGDpedyCvQuAU/6tPUg+a8f97bTLc3qWphKgCAO9gMwzCsDuFN2dnZio6OVlZWlqKioqyOAwsUOpyq89QXReYtG95BtWMrWJQIAHA5rnx++9UxN4A7/H1uapHphY/cQLEBgADiV2dLAVfqiQ+/18LNh8xpjq8BgMBDuUGZ8P2+E3rg7W+VmVNgzvvoYc6yA4BARLlBmTB07qYixWb9qM6KjbLuvicAAM+h3CDgnSpwaM/RPElSk8SK+uhvbRRi8aXBAQCew9/wCGhnHE41GLvYnP575zoUGwAIcPwtj4B278xvikzfUCfGoiQAAG/haykEpLyCM2o4dkmReWmTbpXNxj26ASDQsecGAen3xWbRI+0pNgBQRrDnBgHns9QDRabZYwMAZQt7bhBw3v3m/P3Itk/oSrEBgDKGcoOA8vR/tmr9nmOSpOE31ZU9JNjiRAAAb6PcIGDMWpOmWWvTzOlO9WMtTAMAsArH3C
AgnC506OnPt5rTXz3eUdWvKmdhIgCAVdhzA79nGIaSp6wyp+c99AeKDQCUYZQb+L0h72/S/uOnzOnWNa+yMA0AwGqUG/i9z384ZI53TLjFwiQAAF9AuYFfu+k3X0d9OqidwkL4lQaAso4DiuGXHE5DtUYtKjKvaWJFa8IAAHwK/8yFX7rppfN7bKLCQ/TDuJstTAMA8CXsuYHf+c/3B7X7SK45/e3oZC7WBwAwUW7gV2atSStyPZst47tQbAAARfC1FPzKb4vNK/c2UwU7/RwAUBTlBn5j//E8czzilvrq1qSqhWkAAL6KcgO/cDy3QDf8Y4U53b9dknVhAAA+jXIDv/D55vMX6qsdW4HjbAAAF0W5gc8zDENjPt1iTn8xtL2FaQAAvo5yA5/Xc8Y35vjeVokKDebXFgBwcXxKwOetTztmjsff3sjCJAAAf0C5gU9zOg1zPO+hP3DvKADAZfFJAb9RNy7S6ggAAD9AuYFPO5h1yuoIAAA/Q7mBT9v1m3tIVSwXamESAIC/oNzAp324Yb+ks9e2sdlsFqcBAPgDyg18ltNp6D/fH5QknTxdaHEaAIC/oNzAZw2dl2qOJ97R2LogAAC/QrmBTzp44pS510aSOjeIszANAMCfUG7gc3YezlHbyV+a0y/c1cTCNAAAf0O5gc95c81uc3xD7Rj1aFrVwjQAAH8TYnUA4PdS92VJkm5uGKcZfVtanAYA4G/YcwOfs+1QtiSpdc2rLE4CAPBHlBv4rCbVoq2OAADwQ5Qb+JT73lpvjqtVKmdhEgCAv6LcwKes3H7EHMdHh1uYBADgryg38BnHcgvM8edDbrAwCQDAn1Fu4DOaP7PUHFe/iq+kAAClQ7mBTzjjcJrj1jUqKyqcO4ADAEqHcgOfcMZpmOMZfbi2DQCg9Cg38AnTVuw0x/ZQfi0BAKXHpwh8witfni834aHBFiYBAPg7yg0st/donjke1LGWhUkAAIGAcgPLbTmYZY7/2oFyAwC4MpQbWM7x68HELa+pxFlSAIArRrmB5Ya8v0mSFBRkszgJACAQUG5gqRXbD5vjhIoRFiYBAAQKyg0sNXXZz+b4H3++zsIkAIBAQbmBpb7fd0KS9EjnOgoL4dcRAHDlLP80mTZtmpKSkhQeHq7WrVtr/fr1l1x+6tSpqlevniIiIpSYmKhhw4bp9OnTXkoLd7pr+tfmuEPdGAuTAAACiaXlZt68eRo+fLhSUlK0ceNGNWnSRF26dNHhw4eLXf69997TiBEjlJKSom3btunNN9/UvHnzNGrUKC8nx5XafSRH3+45bk63uKayhWkAAIHE0nIzZcoUDRgwQP3791fDhg01ffp0lStXTrNmzSp2+a+//lrt2rVTr169lJSUpJtvvln33nvvJff25OfnKzs7u8gD1jpyMl+dXlxlTi8b3sHCNACAQGNZuSkoKNCGDRuUnJx8PkxQkJKTk7Vu3bpi12nbtq02bNhglpndu3dr0aJFuvXWWy/6OpMmTVJ0dLT5SExMdO8bgcuuf3aZOX6m+7WqHVvBwjQAgEBjWbnJzMyUw+FQXFxckflxcXFKT08vdp1evXrp6aef1g033KDQ0FDVqlVLf/zjHy/5tdTIkSOVlZVlPvbt2+fW94GSy8zJV9KIheZ00lXl1KdNknWBAAAByfIDil2xcuVKTZw4Ua+99po2btyojz/+WAsXLtQzzzxz0XXsdruioqKKPGCNh+Z8V2R6+aN/tCYIACCghVj1wjExMQoODlZGRkaR+RkZGYqPjy92nTFjxqhPnz568MEHJUmNGzdWbm6uHnroIT311FMKCvKrrlbmbNx7whynTbpVNhtXJAYAuJ9lbSAsLEwtWrTQ8uXLzXlOp1PLly9XmzZtil0nLy/vggITHBwsSTIMw3NhccWO5xaY4zn3t6LYAAA8xrI9N5I0fPhw9evXTy1btlSrVq00depU5ebmqn///pKkvn37KiEhQZMmTZIkdevWTVOmTFGzZs3Uun
Vr7dy5U2PGjFG3bt3MkgPf1OyZpeb4DzWvsjAJACDQWVpuevbsqSNHjmjs2LFKT09X06ZNtXjxYvMg47179xbZUzN
2024-11-15 21:25:43 +04:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
2024-11-15 21:26:00 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApIAAAIjCAYAAACwHvu2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB1zUlEQVR4nO3dd1gU1/s28HtpS28qIHwVQVDB3mIQa0SxF0wUNYolGo3Gjkqixk6CDbEbC2g0xliIJRaCUSwEFUWNIrGGWBAjAiJSd98/fJmfGyywshxk70+uuS72zJkzzy5mfXzmzBmZUqlUgoiIiIiomHREB0BERERE7ycmkkRERESkFiaSRERERKQWJpJEREREpBYmkkRERESkFiaSRERERKQWJpJEREREpBYmkkRERESkFiaSRERERKQWJpJE9EbXr19Hhw4dYGFhAZlMhvDw8BId/86dO5DJZAgNDS3Rcd9nbdq0QZs2bUSHQUT0Vkwkid4DN2/exOeffw5nZ2cYGhrC3Nwcnp6eWLZsGZ4/f67Rc/v5+eHy5cuYP38+tmzZgiZNmmj0fKVp8ODBkMlkMDc3f+XneP36dchkMshkMixatKjY49+/fx+zZs1CXFxcCURLRFT26IkOgIje7MCBA/jkk08gl8sxaNAg1KlTBzk5OTh58iT8/f1x5coVrFu3TiPnfv78OaKjo/H1119jzJgxGjmHo6Mjnj9/Dn19fY2M/zZ6enrIzMzEvn370KdPH5V9W7duhaGhIbKystQa+/79+5g9ezaqVauGBg0aFPm4I0eOqHU+IqLSxkSSqAy7ffs2fH194ejoiKNHj6Jy5crSvtGjR+PGjRs4cOCAxs7/6NEjAIClpaXGziGTyWBoaKix8d9GLpfD09MTP/74Y6FEctu2bejSpQt27dpVKrFkZmbC2NgYBgYGpXI+IqJ3xUvbRGVYUFAQMjIysGHDBpUksoCLiwvGjRsnvc7Ly8PcuXNRvXp1yOVyVKtWDV999RWys7NVjqtWrRq6du2KkydP4oMPPoChoSGcnZ2xefNmqc+sWbPg6OgIAPD394dMJkO1atUAvLgkXPDzy2bNmgWZTKbSFhERgRYtWsDS0hKmpqaoWbMmvvrqK2n/6+ZIHj16FC1btoSJiQksLS3Ro0cPxMfHv/J8N27cwODBg2FpaQkLCwsMGTIEmZmZr/9g/6N///44ePAgUlNTpbazZ8/i+vXr6N+/f6H+KSkpmDx5MurWrQtTU1OYm5ujU6dOuHjxotTn2LFjaNq0KQBgyJAh0iXygvfZpk0b1KlTB7GxsWjVqhWMjY2lz+W/cyT9/PxgaGhY6P17e3vDysoK9+/fL/J7JSIqSUwkicqwffv2wdnZGc2bNy9S/88++wwzZ85Eo0aNsHTpUrRu3RqBgYHw9fUt1PfGjRv4+OOP0b59eyxevBhWVlYYPHgwrly5AgDw8fHB0qVLAQD9+vXDli1bEBwcXKz4r1y5gq5duyI7Oxtz5szB4sWL0b17d5w6deqNx/3222/w9vZGcnIyZs2ahYkTJ+L06dPw9PTEnTt3CvXv06cPnj59isDAQPTp0wehoaGYPXt2keP08fGBTCbD7t27pbZt27ahVq1aaNSoUaH+t27dQnh4OLp27YolS5bA398fly9fRuvWraWkzs3NDXPmzAEAjBgxAlu2bMGWLVvQqlUraZzHjx+jU6dOaNCgAYKDg9G2bdtXxrds2TJUqlQJfn5+yM/PBwCsXbsWR44cwfLly2Fvb1/k90pEVKKURFQmpaWlKQEoe/ToUaT+cXFxSgDKzz77TKV98uTJSgDKo0ePSm2Ojo5KAMqoqCipLTk5WSmXy5WTJk2S2m7fvq0EoFy4cKHKmH5+fkpHR8dCMXzzzTfKl79Wli5dqgSgfPTo0WvjLjjHpk2bpLYGDRoobWxslI8fP5baLl68qNTR0VEOGjSo0PmGDh2qMmavXr2UFSpUeO05X34fJi
YmSqVSqfz444+V7dq1UyqVSmV+fr7Szs5OOXv27Fd+BllZWcr8/PxC70MulyvnzJkjtZ09e7bQeyvQunVrJQDlmjVrXrmvdevWKm2HDx9WAlDOmzdPeevWLaWpqamyZ8+eb32PRESaxIokURmVnp4OADAzMytS/19//RUAMHHiRJX2SZMmAUChuZTu7u5o2bKl9LpSpUqoWbMmbt26pXbM/1Uwt/KXX36BQqEo0jEPHjxAXFwcBg8eDGtra6m9Xr16aN++vfQ+XzZy5EiV1y1btsTjx4+lz7Ao+vfvj2PHjiEpKQlHjx5FUlLSKy9rAy/mVerovPj6zM/Px+PHj6XL9ufPny/yOeVyOYYMGVKkvh06dMDnn3+OOXPmwMfHB4aGhli7dm2Rz0VEpAlMJInKKHNzcwDA06dPi9T/77//ho6ODlxcXFTa7ezsYGlpib///lulvWrVqoXGsLKywpMnT9SMuLC+ffvC09MTn332GWxtbeHr64sdO3a8MaksiLNmzZqF9rm5ueHff//Fs2fPVNr/+16srKwAoFjvpXPnzjAzM8NPP/2ErVu3omnTpoU+ywIKhQJLly6Fq6sr5HI5KlasiEqVKuHSpUtIS0sr8jkdHByKdWPNokWLYG1tjbi4OISEhMDGxqbIxxIRaQITSaIyytzcHPb29vjzzz+Lddx/b3Z5HV1d3Ve2K5VKtc9RMH+vgJGREaKiovDbb79h4MCBuHTpEvr27Yv27dsX6vsu3uW9FJDL5fDx8UFYWBj27Nnz2mokACxYsAATJ05Eq1at8MMPP+Dw4cOIiIhA7dq1i1x5BV58PsVx4cIFJCcnAwAuX75crGOJiDSBiSRRGda1a1fcvHkT0dHRb+3r6OgIhUKB69evq7Q/fPgQqamp0h3YJcHKykrlDucC/616AoCOjg7atWuHJUuW4OrVq5g/fz6OHj2K33///ZVjF8SZkJBQaN+1a9dQsWJFmJiYvNsbeI3+/fvjwoULePr06StvUCqwc+dOtG3bFhs2bICvry86dOgALy+vQp9JUZP6onj27BmGDBkCd3d3jBgxAkFBQTh79myJjU9EpA4mkkRl2JQpU2BiYoLPPvsMDx8+LLT/5s2bWLZsGYAXl2YBFLqzesmSJQCALl26lFhc1atXR1paGi5duiS1PXjwAHv27FHpl5KSUujYgoW5/7skUYHKlSujQYMGCAsLU0nM/vzzTxw5ckR6n5rQtm1bzJ07FytWrICdnd1r++nq6haqdv7888+4d++eSltBwvuqpLu4pk6disTERISFhWHJkiWoVq0a/Pz8Xvs5EhGVBi5ITlSGVa9eHdu2bUPfvn3h5uam8mSb06dP4+eff8bgwYMBAPXr14efnx/WrVuH1NRUtG7dGmfOnEFYWBh69uz52qVl1OHr64upU6eiV69eGDt2LDIzM7F69WrUqFFD5WaTOXPmICoqCl26dIGjoyOSk5OxatUq/O9//0OLFi1eO/7ChQvRqVMneHh4YNiwYXj+/DmWL18OCwsLzJo1q8Tex3/p6Ohg+vTpb+3XtWtXzJkzB0OGDEHz5s1x+fJlbN26Fc7Ozir9qlevDktLS6xZswZmZmYwMTFBs2bN4OTkVKy4jh49ilWrVuGbb76RliPatGkT2rRpgxkzZiAoKKhY4xERlRRWJInKuO7du+PSpUv4+OOP8csvv2D06NGYNm0a7ty5g8WLFyMkJETqu379esyePRtnz57F+PHjcfToUQQEBGD79u0lGlOFChWwZ88eGBsbY8qUKQgLC0NgYCC6detWKPaqVati48aNGD16NFauXIlWrVrh6NGjsLCweO34Xl5eOHToECpUqICZM2di0aJF+PDDD3Hq1KliJ2Ga8NVXX2HSpEk4fPgwxo0bh/Pnz+PAgQOoUqWKSj99fX2EhYVBV1cXI0eORL9+/XD8+PFinevp06cYOnQoGjZsiK+//lpqb9myJcaNG4fFixfjjz/+KJH3RURUXDJlcW
ajExERERH9f6xIEhEREZFamEgSERERkVqYSBIRERGRWphIEhEREZFamEgSERERkVqYSBIRERGRWphIEhEREZFayuW
2024-11-15 21:25:43 +04:00
"text/plain": [
"<Figure size 800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-15 21:26:00 +04:00
"Смещение: 0.8684929985365014\n",
"Дисперсия: 0.003137100883428496\n"
2024-11-15 21:25:43 +04:00
]
}
],
"source": [
2024-11-16 09:40:09 +04:00
"\n",
2024-11-15 21:25:43 +04:00
"# Конвейер для случайного леса\n",
"pipeline_ranfor = Pipeline([\n",
" ('preprocessing', preprocessing),\n",
" ('classifier', RandomForestClassifier())\n",
"])\n",
"\n",
"# Определение сетки гиперпараметров\n",
"param_grid = {\n",
" #Количество деревьев в лесу\n",
" 'classifier__n_estimators': [50, 100, 200],\n",
" #Максимальная глубина дерева\n",
" 'classifier__max_depth': [10, 20, 30],\n",
" #Минимальное количество образцов для листового узла\n",
" 'classifier__min_samples_leaf': [1, 2, 4]\n",
"}\n",
"\n",
"# Создание объекта GridSearchCV\n",
"grid_search = GridSearchCV(pipeline_ranfor, param_grid, cv=5, scoring='accuracy', n_jobs=-1)\n",
"\n",
"# Обучение модели с перебором гиперпараметров\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Лучшая модель случайного леса\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"# Использование и оценка лучшей модели\n",
"y_pred_proba = best_model.predict_proba(X_test)[:, 1]\n",
"print(f'ROC у метода случайного леса = {roc_auc_score(y_test, y_pred_proba)}')\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"print(f'Точность = {accuracy_score(y_test, y_pred)}')\n",
"\n",
"fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)\n",
"\n",
"# построение ROC кривой\n",
"plt.plot(fpr, tpr)\n",
"plt.ylabel('True Positive Rate')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.show()\n",
"\n",
"# Построение матрицы ошибок\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"\n",
"# Визуализация матрицы ошибок\n",
"plt.figure(figsize=(8, 6))\n",
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', \n",
" xticklabels=['Предсказанный \"безопасный\"', 'Предсказанный \"опасный\"'], \n",
" yticklabels=['Действительно \"безопасный\"', 'Действительно \"опасный\"'])\n",
"plt.title('Confusion Matrix')\n",
"plt.ylabel('Actual')\n",
"plt.xlabel('Predicted')\n",
"plt.show()\n",
"\n",
"# Оценка дисперсии и смещения\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
2024-11-16 09:40:09 +04:00
"print(f\"Дисперсия: {std_test_score.mean()}\")\n"
2024-11-15 21:25:43 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Градиентный бустинг"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Лучшие гиперпараметры: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 300, 'classifier__subsample': 0.5}\n",
"ROC у метода градиентного спуска = 0.9012421336337971\n",
"Точность = 0.872737556561086\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABC8klEQVR4nO3dfXzO9eLH8fe12a4N29Da5maam9zlZm7iIIlWVEecTuWUg9TRKTd12lEhLCWcinSinJRUv4rqqJxoDoqDdMisFCY3i7CxsDtsdl3f3x/LVzuGXXNd1/e6rr2ej8f1eHy+332/1/XeF13vvrc2wzAMAQAABIggqwMAAAC4E+UGAAAEFMoNAAAIKJQbAAAQUCg3AAAgoFBuAABAQKHcAACAgFLN6gDe5nQ6dfDgQUVERMhms1kdBwAAVIBhGMrPz1e9evUUFHThfTNVrtwcPHhQ8fHxVscAAACVsH//fjVo0OCCy1S5chMRESGpdONERkZanAYAAFREXl6e4uPjze/xC6ly5ebMoajIyEjKDQAAfqYip5RwQjEAAAgolBsAABBQKDcAACCgUG4AAEBAodwAAICAQrkBAAABhXIDAAACCuUGAAAEFMoNAAAIKJQbAAAQUCwtN//5z3/Ur18/1atXTzabTR9//PFF11m9erU6dOggu92upk2basGCBR7PCQAA/Iel5aawsFDt2rXTnDlzKrT83r17dcstt6hXr15KT0/XX/7yF/3pT3/S8uXLPZwUAAD4C0sfnHnTTTfppptuqvDyc+fOVaNGjTRjxgxJUsuWLbVu3Tq98MIL6tOnj6diAuU6nH9KxSVOq2MAgM8JrRakmIgwyz7fr54KvmHDBiUlJZWZ16dPH/3lL3857zpFRUUqKioyp/Py8jwVDwHktMOpBesz9dl3h1TiNBQeElzm5//de9SiZADg+zo0rKXFI7pb9vl+VW6ysrIUGxtbZl5sbKzy8vJ08uRJhYeHn7POtGnTNHnyZG9FhJ/bkZWnvrPWurSOvRrn5QPAr4UEW/vfRb8qN5Uxbtw4JScnm9N5eXmKj4+3MBF80f6jJ9Tj2S/K/dldnRuqQ8NaCg8tu/cmNjJMVyfU8UY8AIAL/KrcxMXFKTs7u8y87OxsRUZGlrvXRpLsdrvsdrs34sEPFZc41ePZz5WdV1Rmfv1a4fr3I9eqht2v/okAAORn5aZr165atmxZmXkrVqxQ165dLUoEf3fDC2vKFJuklrGaN6SjbDabhakAAJfC0nJTUFCgXbt2mdN79+5Venq66tSpo4YNG2rcuHE6cOCA3nrrLUnSAw88oNmzZ+uxxx7Tvffeq88//1zvv/++li5datWvAD/2RcZh/fjzCXM6Y0pf2asFX2ANAIA/sPSMn6+//lrt27dX+/btJUnJyclq3769Jk2aJEk6dOiQ9u3bZy7fqFEjLV26VCtWrFC7du00Y8YMvfbaa1wGDpdt/vGYhr2xyZz+bnIfig0ABAibYRiG1SG8KS8vT1FRUcrNzVVkZKTVceBlmTmFuu751WXmvXVvZ13b7HJrAgEAKsSV72+uYUWV8c3+4+cUm+E9GlFsACDA+NUJxcCl6D9nvTm+ql6kFo/oxqEoAAhAlBtUCb9+TMJNreP0yh87WpgGAOBJHJZCwDtZ7NCAX+21mX5bWwvTAAA8jT03CGh3zt2gjZllnwMVVT3EojQAAG9gzw0CVkFRyTnF5pOR1j3IDQDgHey5QcC691f3sfkm5UZFhbPHBgCqAvbcICDtPlJQZq8NxQYAqg7KDQJOUYlD189YY06n/qWHhWkAAN5GuUHAmf352eeVtakfpRZx3IkaAKoSyg0CyoL1e/XSr8rNklGcQAwAVQ0nFCMg7D96Qj2e/aLMvGd/31Y2m82iRAAAq7DnBgEhff/xMtPjbmqhOzo1sCYMAMBS7LlBQHjsw28lSS3iIpT6l2stTgMAsBJ7buD3Hnh7s06edkiS6kaFWZ
wGAGA1yg382pe7cpT6fZY5PXcwD8QEgKqOcgO/dbSwWHe/9l9z+t+PXCt7tWALEwEAfAHlBn6rw9MrzHFKv1ZqFhthYRoAgK+g3MDvNb68hoZ1b2R1DACAj6DcwO/sP3pCCWOXmtMf/LmrhWkAAL6GcgO/k/x+epnpOjVCrQkCAPBJlBv4leISpzZlHjOndzzdl7sQAwDKoNzAr+SePG2O1z7WS2EhXB0FACiLcgO/8t2BXHMcX6e6hUkAAL6KcgO/MmzBJqsjAAB8HOUGfuWyX04eHtY9wdogAACfRbmB3ygoKtHPhcWSpP6J9S1OAwDwVZQb+I2l3x40xy3iuBsxAKB8lBv4jaOFpVdKBdnEVVIAgPOi3MAv5J06rb+l7pAkdW8abXEaAIAvo9zA5x3KPam2T/7bnL7xqjgL0wAAfB3lBj6v67TPzXHnhDoa/JsrLEwDAPB1lBv4tLc3ZJrjGqHBeu/+31gXBgDgF6pZHQA4n9fW7tGUpdvN6a8n3KDgIJ4jBQC4MPbcwCcVFpWUKTbvDf+NwkO5QgoAcHHsuYFP+ve2LHO86P7fqEvjyyxMAwDwJ+y5gU965pe9NmEhQRQbAIBLKDfwOUUlDuUUlD5mIeGyGhanAQD4G8oNfM7/fbXPHL/yx44WJgEA+CPKDXzO059uM8eNotlzAwBwDeUGPmXLvmPmeHTvphYmAQD4K8oNfMrvXv7SHI/sRbkBALiOcgOfkfjU2edHPdqnOU/+BgBUCuUGPuFYYbGOnzhtTo+4romFaQAA/oxyA5/Q/ukV5viHZ26SzcZjFgAAlUO5geUWbjx76XdifC2FBPPXEgBQeXyLwHKT/3X20u+PRnSzMAkAIBBQbmC5OjVCJUkP9GzC4SgAwCWj3MBSye+n68Dxk5Kk3i1iLE4DAAgElBtY5ouMw1qcdsCcbh4bYWEaAECgoNzAMsPe2GSONz5xvaKqh1iYBgAQKCg3sMTJYoc5vqtzQ8VEhFmYBgAQSCg3sMQTH209O76lpYVJAACBhnIDS2zY87Mkqaa9mmraq1mcBgAQSCg38Lqfjp3QodxTkqSPR3a3OA0AINBQbuB11/ztC3PcOLqGhUkAAIGIcgOvOu1wmuPOjeooKIib9gEA3MvycjNnzhwlJCQoLCxMXbp00caNGy+4/KxZs9S8eXOFh4crPj5ejzzyiE6dOuWltLhUa384Yo5fG9rJwiQAgEBlablZtGiRkpOTlZKSorS0NLVr1059+vTR4cOHy13+3Xff1dixY5WSkqLt27fr9ddf16JFizR+/HgvJ0dlFJU4dO+Cr83pGqGcSAwAcD9Ly83MmTM1fPhwDRs2TK1atdLcuXNVvXp1zZ8/v9zlv/zyS3Xv3l133323EhISdOONN+quu+664N6eoqIi5eXllXnBGuP+efby78f7tlAwh6QAAB5gWbkpLi7W5s2blZSUdDZMUJCSkpK0YcOGctfp1q2bNm/ebJaZPXv2aNmyZbr55pvP+znTpk1TVFSU+YqPj3fvLwKXRYWH6IGeja2OAQAIUJYdF8jJyZHD4VBsbGyZ+bGxsdqxY0e569x9993KycnRNddcI8MwVFJSogceeOCCh6XGjRun5ORkczovL4+CY4GDx09q8ZbS50gN7XoFT/8GAHiM5ScUu2L16tWaOnWqXn75ZaWlpWnx4sVaunSpnn766fOuY7fbFRkZWeYF7+s2/XNzfHWjOhYmAQAEOsv23ERHRys4OFjZ2dll5mdnZysuLq7cdSZOnKjBgwfrT3/6kySpTZs2Kiws1P33368nnnhCQUF+1dWqlMsj7DqSX6QWcRHqceXlVscBAAQwy9pAaGioOnbsqFWrVpnznE6nVq1apa5du5a7zokTJ84pMMHBwZIkwzA8FxaX5JP0AzqSXyRJmnlnorVhAAABz9JrcZOTkzV06FB16tRJnTt31qxZs1RYWKhhw4ZJkoYMGaL69etr2rRpkqR+/fpp5syZat++vbp06a
Jdu3Zp4sSJ6tevn1ly4FtOFJfo4YXp5nR8nXDrwgAAqgRLy83AgQN15MgRTZo0SVlZWUpMTFRqaqp5kvG+ffvK7Km
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Матрица ошибок:\n",
"[[1326 400]\n",
" [ 50 1760]]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApIAAAIjCAYAAACwHvu2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB2r0lEQVR4nO3dd1gUV9sG8HtpS19ABcQogljAiiWKWCOR2BUTRY1iSYwGK4pKYm8k2BC7sWCNMRZiiUaiURQRFUWNIrEGG2BEBERpu98ffuzrigVWloPs/cs118WeOTPz7BLXx+ecOSNRKBQKEBEREREVkY7oAIiIiIjow8REkoiIiIjUwkSSiIiIiNTCRJKIiIiI1MJEkoiIiIjUwkSSiIiIiNTCRJKIiIiI1MJEkoiIiIjUwkSSiIiIiNTCRJKI3uratWto3749ZDIZJBIJwsLCivX8t2/fhkQiQWhoaLGe90PWpk0btGnTRnQYRETvxESS6ANw48YNfPPNN3B0dIShoSHMzc3h7u6OxYsX49mzZxq9to+PDy5duoQ5c+Zg06ZNaNy4sUavV5IGDhwIiUQCc3Pz136O165dg0QigUQiwfz584t8/vv372P69OmIjY0thmiJiEofPdEBENHb7d+/H1988QWkUikGDBiAOnXqIDs7GydOnIC/vz8uX76M1atXa+Taz549Q1RUFL7//nuMGDFCI9ewt7fHs2fPoK+vr5Hzv4uenh4yMzOxd+9e9OrVS2Xfli1bYGhoiOfPn6t17vv372PGjBmoWrUqGjRoUOjjDh06pNb1iIhKGhNJolLs1q1b8Pb2hr29PY4cOYKKFSsq9/n6+uL69evYv3+/xq7/8OFDAICFhYXGriGRSGBoaKix87+LVCqFu7s7fv755wKJ5NatW9GpUyfs3LmzRGLJzMyEsbExDAwMSuR6RETvi0PbRKVYUFAQMjIysHbtWpUkMp+TkxNGjx6tfJ2bm4tZs2ahWrVqkEqlqFq1Kr777jtkZWWpHFe1alV07twZJ06cwMcffwxDQ0M4Ojpi48aNyj7Tp0+Hvb09AMDf3x8SiQRVq1YF8GJIOP/nl02fPh0SiUSlLTw8HC1atICFhQVMTU1Rs2ZNfPfdd8r9b5ojeeTIEbRs2RImJiawsLBAt27dEBcX99rrXb9+HQMHDoSFhQVkMhkGDRqEzMzMN3+wr+jbty8OHDiA1NRUZduZM2dw7do19O3bt0D/lJQUjB8/HnXr1oWpqSnMzc3RoUMHXLhwQdnn6NGjaNKkCQBg0KBByiHy/PfZpk0b1KlTBzExMWjVqhWMjY2Vn8urcyR9fHxgaGhY4P17enrC0tIS9+/fL/R7JSIqTkwkiUqxvXv3wtHREc2bNy9U/6+++gpTp05Fw4YNsWjRIrRu3RqBgYHw9vYu0Pf69ev4/PPP8emnn2LBggWwtLTEwIEDcfnyZQCAl5cXFi1aBADo06cPNm3ahODg4CLFf/nyZXTu3BlZWVmYOXMmFixYgK5duyIyMvKtx/3555/w9PREcnIypk+fDj8/P5w8eRLu7u64fft2gf69evVCeno6AgMD0atXL4SGhmLGjBmFjtPLywsSiQS7du1Stm3duhW1atVCw4YNC/S/efMmwsLC0LlzZyxcuBD+/v64dOkSWrdurUzqnJ2dMXPmTADA0KFDsWnTJmzatAmtWrVSnufRo0fo0KEDGjRogODgYLRt2/a18S1evBgVKlSAj48P8vLyAACrVq3CoUOHsGTJEtjZ2RX6vRIRFSsFEZVKT548UQBQdOvWrVD9Y2NjFQAUX331lUr7+PHjFQAUR44cUbbZ29srACgiIiKUbcnJyQqpVKoYN26csu3WrVsKAIp58+apnNPHx0dhb29fIIZp06YpXv5aWbRokQKA4uHDh2+MO/8a69evV7Y1aNBAYW1trXj06JGy7cKFCwodHR3FgAEDClxv8ODBKufs0aOHoly5cm+85svvw8
TERKFQKBSff/65ol27dgqFQqHIy8tT2NraKmbMmPHaz+D58+eKvLy8Au9DKpUqZs6cqWw7c+ZMgfeWr3Xr1goAipUrV752X+vWrVXa/vjjDwUAxezZsxU3b95UmJqaKrp37/7O90hEpEmsSBKVUmlpaQAAMzOzQvX//fffAQB+fn4q7ePGjQOAAnMpXVxc0LJlS+XrChUqoGbNmrh586baMb8qf27lb7/9BrlcXqhjHjx4gNjYWAwcOBBWVlbK9nr16uHTTz9Vvs+XDRs2TOV1y5Yt8ejRI+VnWBh9+/bF0aNHkZiYiCNHjiAxMfG1w9rAi3mVOjovvj7z8vLw6NEj5bD9uXPnCn1NqVSKQYMGFapv+/bt8c0332DmzJnw8vKCoaEhVq1aVehrERFpAhNJolLK3NwcAJCenl6o/v/++y90dHTg5OSk0m5rawsLCwv8+++/Ku1VqlQpcA5LS0s8fvxYzYgL6t27N9zd3fHVV1/BxsYG3t7e2L59+1uTyvw4a9asWWCfs7Mz/vvvPzx9+lSl/dX3YmlpCQBFei8dO3aEmZkZfvnlF2zZsgVNmjQp8Fnmk8vlWLRoEapXrw6pVIry5cujQoUKuHjxIp48eVLoa1aqVKlIN9bMnz8fVlZWiI2NRUhICKytrQt9LBGRJjCRJCqlzM3NYWdnh7///rtIx716s8ub6OrqvrZdoVCofY38+Xv5jIyMEBERgT///BP9+/fHxYsX0bt3b3z66acF+r6P93kv+aRSKby8vLBhwwbs3r37jdVIAJg7dy78/PzQqlUrbN68GX/88QfCw8NRu3btQldegRefT1GcP38eycnJAIBLly4V6VgiIk1gIklUinXu3Bk3btxAVFTUO/va29tDLpfj2rVrKu1JSUlITU1V3oFdHCwtLVXucM73atUTAHR0dNCuXTssXLgQV65cwZw5c3DkyBH89ddfrz13fpzx8fEF9l29ehXly5eHiYnJ+72BN+jbty/Onz+P9PT0196glG/Hjh1o27Yt1q5dC29vb7Rv3x4eHh4FPpPCJvWF8fTpUwwaNAguLi4YOnQogoKCcObMmWI7PxGROphIEpViEyZMgImJCb766iskJSUV2H/jxg0sXrwYwIuhWQAF7qxeuHAhAKBTp07FFle1atXw5MkTXLx4Udn24MED7N69W6VfSkpKgWPzF+Z+dUmifBUrVkSDBg2wYcMGlcTs77//xqFDh5TvUxPatm2LWbNmYenSpbC1tX1jP11d3QLVzl9//RX37t1TactPeF+XdBfVxIkTkZCQgA0bNmDhwoWoWrUqfHx83vg5EhGVBC5ITlSKVatWDVu3bkXv3r3h7Oys8mSbkydP4tdff8XAgQMBAPXr14ePjw9Wr16N1NRUtG7dGqdPn8aGDRvQvXv3Ny4tow5vb29MnDgRPXr0wKhRo5CZmYkVK1agRo0aKjebzJw5ExEREejUqRPs7e2RnJyM5cuX46OPPkKLFi3eeP558+ahQ4cOcHNzw5AhQ/Ds2TMsWbIEMpkM06dPL7b38SodHR1Mnjz5nf06d+6MmTNnYtCgQWjevDkuXbqELVu2wNHRUaVftWrVYGFhgZUrV8LMzAwmJiZo2rQpHBwcihTXkSNHsHz5ckybNk25HNH69evRpk0bTJkyBUFBQUU6HxFRcWFFkqiU69q1Ky5evIjPP/8cv/32G3x9fTFp0iTcvn0bCxYsQEhIiLLvmjVrMGPGDJw5cwZjxozBkSNHEBAQgG3bthVrTOXKlcPu3bthbGyMCRMmYMOGDQgMDESXLl0KxF6lShWsW7cOvr6+WLZsGVq1aoUjR45AJpO98fweHh44ePAgypUrh6lTp2L+/Plo1qwZIiMji5yEacJ3332HcePG4Y8//sDo0aNx7tw57N+/H5UrV1bpp6+vjw0bNkBXVxfDhg1Dnz59cOzYsSJdKz09HYMHD4arqyu+//57ZXvLli0xevRoLFiwAKdOnSqW90VEVFQSRV
FmoxMRERER/T9WJImIiIhILUwkiYiIiEgtTCSJiIiISC1MJImIiIhILUwkiYiIiEgtTCSJiIiISC1MJImIiIhILWX
"text/plain": [
"<Figure size 800x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Смещение: 0.8811650848575816\n",
"Дисперсия: 0.008658656436943876\n"
]
}
],
"source": [
"# Конвейер\n",
"pipeline_grad = Pipeline([\n",
" ('preprocessing', preprocessing),\n",
" ('classifier', GradientBoostingClassifier())\n",
"])\n",
"\n",
"# Определение сетки гиперпараметров\n",
"param_grid = {\n",
" 'classifier__n_estimators': [100, 200, 300],\n",
" #Скорость обучения\n",
" 'classifier__learning_rate': [0.1, 0.2],\n",
" #Максимальная глубина дерева\n",
" 'classifier__max_depth': [3, 5, 7],\n",
" 'classifier__subsample': [0.1, 0.5, 1.0],\n",
"}\n",
"\n",
"# Создание объекта GridSearchCV\n",
"grid_search = GridSearchCV(pipeline_grad, param_grid, cv=2, scoring='roc_auc', n_jobs=-1)\n",
"\n",
"# Обучение модели с перебором гиперпараметров\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"print(\"Лучшие гиперпараметры: \", grid_search.best_params_)\n",
"\n",
"# Лучшая модель\n",
"best_model = grid_search.best_estimator_\n",
"\n",
"# Использование и оценка лучшей модели\n",
"y_pred_proba = best_model.predict_proba(X_test)[:, 1]\n",
"print(f'ROC у метода градиентного спуска = {roc_auc_score(y_test, y_pred_proba)}')\n",
"\n",
"y_pred = best_model.predict(X_test)\n",
"print(f'Точность = {accuracy_score(y_test, y_pred)}')\n",
"\n",
"fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)\n",
"\n",
"# построение ROC кривой\n",
"plt.plot(fpr, tpr)\n",
"plt.ylabel('True Positive Rate')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.show()\n",
"\n",
"# Построение матрицы ошибок\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"\n",
"# Визуализация матрицы ошибок\n",
"plt.figure(figsize=(8, 6))\n",
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', \n",
" xticklabels=['Предсказанный \"безопасный\"', 'Предсказанный \"опасный\"'], \n",
" yticklabels=['Действительно \"безопасный\"', 'Действительно \"опасный\"'])\n",
"plt.title('Confusion Matrix')\n",
"plt.ylabel('Actual')\n",
"plt.xlabel('Predicted')\n",
"plt.show()\n",
"\n",
"# Оценка дисперсии и смещения\n",
"cv_results = grid_search.cv_results_\n",
"mean_test_score = cv_results['mean_test_score']\n",
"std_test_score = cv_results['std_test_score']\n",
"\n",
"print(f\"Смещение: {mean_test_score.mean()}\")\n",
"print(f\"Дисперсия: {std_test_score.mean()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Вывод**:\n",
"\n",
"Все модели классификации показали хорошие результаты, но лучший показатель точности у случайного леса. При этом все рассмотренные модели немного не дотянули до показателя точности в 90%. Дополнительная настройка гиперпараметров могла бы ещё больше приблизить значение оценки к 90%."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}