{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вариант: Список людей. "
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 10 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Id 10000 non-null object \n",
" 1 Name 10000 non-null object \n",
" 2 Short description 9996 non-null object \n",
" 3 Gender 9927 non-null object \n",
" 4 Country 9721 non-null object \n",
" 5 Occupation 9836 non-null object \n",
" 6 Birth year 10000 non-null int64 \n",
" 7 Death year 9999 non-null float64\n",
" 8 Manner of death 1893 non-null object \n",
" 9 Age of death 9999 non-null float64\n",
"dtypes: float64(2), int64(1), object(7)\n",
"memory usage: 781.4+ KB\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn import metrics\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.metrics import (\n",
" precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,\n",
" matthews_corrcoef, cohen_kappa_score, confusion_matrix\n",
")\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
"import numpy as np\n",
"import featuretools as ft\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"# Функция для применения oversampling\n",
"def apply_oversampling(X, y):\n",
" oversampler = RandomOverSampler(random_state=42)\n",
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
" return X_resampled, y_resampled\n",
"\n",
"# Функция для применения undersampling\n",
"def apply_undersampling(X, y):\n",
" undersampler = RandomUnderSampler(random_state=42)\n",
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
" return X_resampled, y_resampled\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
"):\n",
" \"\"\"\n",
" Splits a Pandas dataframe into three subsets (train, val, and test)\n",
" following fractional ratios provided by the user, where each subset is\n",
" stratified by the values in a specific column (that is, each subset has\n",
" the same relative frequency of the values in the column). It performs this\n",
" splitting by running train_test_split() twice.\n",
"\n",
" Parameters\n",
" ----------\n",
" df_input : Pandas dataframe\n",
" Input dataframe to be split.\n",
" stratify_colname : str\n",
" The name of the column that will be used for stratification. Usually\n",
" this column would be for the label.\n",
" frac_train : float\n",
" frac_val : float\n",
" frac_test : float\n",
" The ratios with which the dataframe will be split into train, val, and\n",
" test data. The values should be expressed as float fractions and should\n",
" sum to 1.0.\n",
" random_state : int, None, or RandomStateInstance\n",
" Value to be passed to train_test_split().\n",
"\n",
" Returns\n",
" -------\n",
" df_train, df_val, df_test :\n",
" Dataframes containing the three splits.\n",
" \"\"\"\n",
"\n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
"\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
"\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
"\n",
" return df_train, df_val, df_test\n",
"\n",
"\n",
"df = pd.read_csv(\"../data/age.csv\", nrows=10000)\n",
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Как бизнес-цели выделим следующие 2 варианта:\n",
" 1) GameDev. Создание игры про конкретного персонажа, живущего в конкретном временном промежутке в конкретной стране. \n",
" 2) Исследование зависимости длительности жизни от страны проживания.\n",
" \n",
"Поскольку именно эти бизнес-цели были выбраны в предыдущей лабораторной работе, будем их использовать.\n",
"Н о возникает проблема с 1 целью: её невозможно использовать для задачи классификации. Заменим е е на классификацию людей по возрастным группам, что может быть полезно для рекламных целей."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним подготовку данных"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"df.fillna({\"Gender\": \"NaN\", \"Country\": \"NaN\", \"Occupation\" : \"NaN\", \"Manner of death\" : \"NaN\"}, inplace=True)\n",
"df = df.dropna()\n",
"df['Country'] = df['Country'].str.split('; ')\n",
"df = df.explode('Country')\n",
"data = df.copy()\n",
"\n",
"value_counts = data[\"Country\"].value_counts()\n",
"rare = value_counts[value_counts < 100].index\n",
"data = data[~data[\"Country\"].isin(rare)]\n",
"\n",
"data.drop(data[~data['Gender'].isin(['Male', 'Female'])].index, inplace=True)\n",
"\n",
"data1 = pd.get_dummies(data, columns=['Gender', 'Country', 'Occupation'], drop_first=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Определить достижимый уровень качества модели для каждой задачи. Н а основе имеющихся данных уровень качества моделей не будет высоким, поскольку все таки длительность жизни лишь примерная и точно е е угадать невозможно."
]
},
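{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small illustrative check (it only assumes the `data` frame prepared above), the spread of the target gives a rough sense of how much room for error there is:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Distribution of the regression target: a large standard deviation relative to the mean\n",
"# suggests that exact lifespans are hard to predict\n",
"print(data['Age of death'].describe())"
]
},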
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выберем ориентиры для наших 2х задач:\n",
" 1)Регрессии - средний возраст человека\n",
" 2)Классификации - аиболее часто встречающаяся возрастная группа"
]
},
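{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of how these baselines could be computed; it assumes the `data` frame from the preparation cell above and reuses the same age bins that the classification cell defines later:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Regression baseline: always predict the mean age of death\n",
"baseline_age = data['Age of death'].mean()\n",
"print(f'Mean age of death (regression baseline): {baseline_age:.1f}')\n",
"\n",
"# Classification baseline: always predict the most frequent age group\n",
"# (same bins and labels as in the classification cell below)\n",
"bins = [0, 18, 30, 50, 70, 100]\n",
"labels = ['0-18', '19-30', '31-50', '51-70', '71+']\n",
"age_groups = pd.cut(data['Age of death'], bins=bins, labels=labels)\n",
"majority_share = age_groups.value_counts(normalize=True).max()\n",
"print(f'Most frequent age group: {age_groups.mode()[0]} '\n",
"      f'(a constant classifier would reach accuracy {majority_share:.2f})')"
]
},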
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Построим конвейер."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Id', 'Name', 'Short description', 'Birth year', 'Death year',\n",
" 'Age of death', 'Gender_Male', 'Country_France',\n",
" 'Country_German Confederation', 'Country_German Democratic Republic',\n",
" ...\n",
" 'Manner of death_euthanasia', 'Manner of death_homicide',\n",
" 'Manner of death_homicide; natural causes',\n",
" 'Manner of death_internal bleeding', 'Manner of death_natural causes',\n",
" 'Manner of death_suicide',\n",
" 'Manner of death_suicide; homicide; accident',\n",
" 'Manner of death_suicide; unfortunate accident',\n",
" 'Manner of death_summary execution', 'Manner of death_unnatural death'],\n",
" dtype='object', length=400)\n"
]
}
],
"source": [
"print(data.columns)\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters for Linear Regression: {}\n",
"Best parameters for Random Forest Regressor: {'model__max_depth': None, 'model__n_estimators': 100}\n",
"Best parameters for Gradient Boosting Regressor: {'model__learning_rate': 0.2, 'model__max_depth': 7, 'model__n_estimators': 300}\n",
"Linear Regression: MSE = 0.002807184047660083, R2 = 0.9999899555289343\n",
"Random Forest Regressor: MSE = 11.46917740409879, R2 = 0.9589617856804076\n",
"Gradient Boosting Regressor: MSE = 8.202651735797296, R2 = 0.9706498410424512\n"
]
}
],
"source": [
"X_reg = data1.drop(['Id', 'Name', 'Age of death', 'Short description', 'Manner of death'], axis=1)\n",
"y_reg = data1['Age of death']\n",
"\n",
"# Разделение данных\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"# Выбор моделей для регрессии\n",
"models_reg = {\n",
" 'Linear Regression': LinearRegression(),\n",
" 'Random Forest Regressor': RandomForestRegressor(random_state=42),\n",
" 'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)\n",
"}\n",
"\n",
"# Создание конвейера для регрессии\n",
"pipelines_reg = {}\n",
"for name, model in models_reg.items():\n",
" pipelines_reg[name] = Pipeline([\n",
" ('scaler', StandardScaler()),\n",
" ('model', model)\n",
" ])\n",
"\n",
"# Определение сетки гиперпараметров для регрессии\n",
"param_grids_reg = {\n",
" 'Linear Regression': {},\n",
" 'Random Forest Regressor': {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__max_depth': [None, 10, 20, 30]\n",
" },\n",
" 'Gradient Boosting Regressor': {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
" 'model__max_depth': [3, 5, 7]\n",
" }\n",
"}\n",
"\n",
"# Настройка гиперпараметров для регрессии\n",
"best_models_reg = {}\n",
"for name, pipeline in pipelines_reg.items():\n",
" grid_search = GridSearchCV(pipeline, param_grids_reg[name], cv=5, scoring='neg_mean_squared_error')\n",
" grid_search.fit(X_train_reg, y_train_reg)\n",
" best_models_reg[name] = grid_search.best_estimator_\n",
" print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
"\n",
"# Обучение моделей и оценка качества\n",
"for model_name in best_models_reg.keys():\n",
" print(f\"Model: {model_name}\")\n",
" model = best_models_reg[model_name][\"model\"]\n",
"\n",
" model_pipeline = Pipeline([(\"scaler\", StandardScaler()), (\"model\", model)])\n",
" model_pipeline = model_pipeline.fit(X_train_reg, y_train_reg)\n",
"\n",
" y_train_predict = model_pipeline.predict(X_train_reg)\n",
" y_test_predict = model_pipeline.predict(X_test_reg)\n",
"\n",
" best_models_reg[model_name][\"pipeline\"] = model_pipeline\n",
" best_models_reg[model_name][\"preds_train\"] = y_train_predict\n",
" best_models_reg[model_name][\"preds_test\"] = y_test_predict\n",
"\n",
" best_models_reg[model_name][\"MSE_train\"] = mean_squared_error(y_train_reg, y_train_predict)\n",
" best_models_reg[model_name][\"MSE_test\"] = mean_squared_error(y_test_reg, y_test_predict)\n",
" best_models_reg[model_name][\"R2_train\"] = r2_score(y_train_reg, y_train_predict)\n",
" best_models_reg[model_name][\"R2_test\"] = r2_score(y_test_reg, y_test_predict)\n",
" best_models_reg[model_name][\"MAE_train\"] = mean_absolute_error(y_train_reg, y_train_predict)\n",
" best_models_reg[model_name][\"MAE_test\"] = mean_absolute_error(y_test_reg, y_test_predict)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"data2 = data.drop(['Short description', 'Manner of death', 'Gender', 'Country', 'Occupation'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Birth year', 'Death year'], dtype='object')\n",
"Best parameters for Logistic Regression: {'model__C': 10, 'model__solver': 'lbfgs'}\n",
"Best parameters for Random Forest Classifier: {'model__max_depth': 30, 'model__n_estimators': 200}\n",
"Best parameters for Gradient Boosting Classifier: {'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200}\n",
"Model: Logistic Regression\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\89176\\sourse\\MII\\Labas\\AIM-PIbd-31-Kozyrev-S-S\\aimvenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Random Forest Classifier\n",
"Model: Gradient Boosting Classifier\n"
]
}
],
"source": [
"# Создание возрастных групп\n",
"bins = [0, 18, 30, 50, 70, 100]\n",
"labels = ['0-18', '19-30', '31-50', '51-70', '71+']\n",
"data['Age Group'] = pd.cut(data['Age of death'], bins=bins, labels=labels)\n",
"\n",
"# Выбор признаков и целевой переменной для классификации\n",
"X_class = data2.drop(['Id', 'Name', 'Age of death', 'Age Group'], axis=1)\n",
"y_class = data['Age Group'] \n",
"print(X_class.columns)\n",
"# Разделение данных\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Выбор моделей для классификации\n",
"models_class = {\n",
" 'Logistic Regression': LogisticRegression(random_state=42, max_iter=5000, solver='liblinear'),\n",
" 'Random Forest Classifier': RandomForestClassifier(random_state=42),\n",
" 'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42)\n",
"}\n",
"\n",
"# Создание конвейера для классификации\n",
"pipelines_class = {}\n",
"for name, model in models_class.items():\n",
" pipelines_class[name] = Pipeline([\n",
" ('scaler', StandardScaler()),\n",
" ('model', model)\n",
" ])\n",
"\n",
"# Определение сетки гиперпараметров для классификации\n",
"'''\n",
"param_grids_class = {\n",
" 'Logistic Regression': {\n",
" 'model__C': [0.1, 1, 10],\n",
" 'model__solver': ['lbfgs', 'liblinear']\n",
" },\n",
" 'Random Forest Classifier': {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__max_depth': [None, 10, 20, 30]\n",
" },\n",
" 'Gradient Boosting Classifier': {\n",
" 'model__n_estimators': [100, 200, 300],\n",
" 'model__learning_rate': [0.01, 0.1, 0.2],\n",
" 'model__max_depth': [3, 5, 7]\n",
" }\n",
"}'''\n",
"# Убрал определение параметров поскольку уже был предподсчет данных, но вылетела ошибка. Сохранил лучшие параметры\n",
"\n",
"param_grids_class = {\n",
" 'Logistic Regression': {\n",
" 'model__C': [10],\n",
" 'model__solver': ['lbfgs']\n",
" },\n",
" 'Random Forest Classifier': {\n",
" 'model__n_estimators': [200],\n",
" 'model__max_depth': [ 30]\n",
" },\n",
" 'Gradient Boosting Classifier': {\n",
" 'model__n_estimators': [200],\n",
" 'model__learning_rate': [0.1],\n",
" 'model__max_depth': [7]\n",
" }\n",
"}\n",
"\n",
"# Настройка гиперпараметров для классификации\n",
"best_models_class = {}\n",
"for name, pipeline in pipelines_class.items():\n",
" grid_search = GridSearchCV(pipeline, param_grids_class[name], cv=5, scoring='accuracy', n_jobs=-1)\n",
" grid_search.fit(X_train_class, y_train_class)\n",
" best_models_class[name] = {\"model\": grid_search.best_estimator_}\n",
" print(f'Best parameters for {name}: {grid_search.best_params_}')\n",
"\n",
"# Обучение моделей и оценка качества\n",
"for model_name in best_models_class.keys():\n",
" print(f\"Model: {model_name}\")\n",
" model = best_models_class[model_name][\"model\"]\n",
"\n",
" model_pipeline = Pipeline([(\"scaler\", StandardScaler()), (\"model\", model)])\n",
" model_pipeline = model_pipeline.fit(X_train_class, y_train_class)\n",
"\n",
" y_train_predict = model_pipeline.predict(X_train_class)\n",
" y_test_probs = model_pipeline.predict_proba(X_test_class)\n",
" y_test_predict = model_pipeline.predict(X_test_class)\n",
"\n",
" best_models_class[model_name][\"pipeline\"] = model_pipeline\n",
" best_models_class[model_name][\"probs\"] = y_test_probs\n",
" best_models_class[model_name][\"preds\"] = y_test_predict\n",
"\n",
" best_models_class[model_name][\"Precision_train\"] = precision_score(y_train_class, y_train_predict, average='weighted')\n",
" best_models_class[model_name][\"Precision_test\"] = precision_score(y_test_class, y_test_predict, average='weighted')\n",
" best_models_class[model_name][\"Recall_train\"] = recall_score(y_train_class, y_train_predict, average='weighted')\n",
" best_models_class[model_name][\"Recall_test\"] = recall_score(y_test_class, y_test_predict, average='weighted')\n",
" best_models_class[model_name][\"Accuracy_train\"] = accuracy_score(y_train_class, y_train_predict)\n",
" best_models_class[model_name][\"Accuracy_test\"] = accuracy_score(y_test_class, y_test_predict)\n",
" best_models_class[model_name][\"ROC_AUC_test\"] = roc_auc_score(y_test_class, y_test_probs, multi_class='ovr')\n",
" best_models_class[model_name][\"F1_train\"] = f1_score(y_train_class, y_train_predict, average='weighted')\n",
" best_models_class[model_name][\"F1_test\"] = f1_score(y_test_class, y_test_predict, average='weighted')\n",
" best_models_class[model_name][\"MCC_test\"] = matthews_corrcoef(y_test_class, y_test_predict)\n",
" best_models_class[model_name][\"Cohen_kappa_test\"] = cohen_kappa_score(y_test_class, y_test_predict)\n",
" best_models_class[model_name][\"Confusion_matrix\"] = confusion_matrix(y_test_class, y_test_predict)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAcAAAAQ9CAYAAADOEZ0TAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1QU19sH8O/Sdll6L1IFRbEHI8FuJKIxlliiERPs0aCxxMbPbkSiJmrssQQ00RgTu1EUu0Y0FrAioqCgUkR6Xdi97x+8TBwB3UVgd9nnc86c4965M/Ncdt1n7507MwLGGAMhhBCiYbSUHQAhhBCiDJQACSGEaCRKgIQQQjQSJUBCCCEaiRIgIYQQjUQJkBBCiEaiBEgIIUQjUQIkhBCikSgBEkII0UiUAIlK6Nq1K7p27Vpj+3NxccGIESNqbH8EEAgEWLhwobLDIKTGUAIkPGFhYRAIBLh27ZqyQ3mrS5cuYeHChcjKyqrV47i4uEAgEHCLgYEB2rVrhx07dtTqcQkhtUtH2QEQAgAnTpxQeJtLly5h0aJFGDFiBExNTXnrYmNjoaVVc7/vWrdujW+//RYAkJycjK1btyIgIADFxcUYO3ZsjR1HlRUWFkJHh74ySP1Bn2aiEvT09Gp0f0KhsEb316BBAwwfPpx7PWLECDRs2BCrVq2q8wSYn58PAwODOj0mAIhEojo/JiG1iYZASbVERUWhV69eMDY2hqGhIbp3747Lly9XqHfr1i106dIF+vr6cHBwwJIlSxAaGgqBQIDHjx9z9So7B7h27Vo0a9YMYrEYZmZmaNu2LXbt2gUAWLhwIWbMmAEAcHV15YYny/dZ2TnArKwsTJ06FS4uLhAKhXBwcMCXX36J9PR0hdtvZWWFJk2a4NGjR7xymUyG1atXo1mzZhCJRLCxscFXX32FzMzMCvUWLlwIe3t7iMVidOvWDffu3asQd/mQ9Llz5/D111/D2toaDg4O3Ppjx46hU6dOMDAwgJGREXr37o27d+/yjpWSkoKRI0fCwcEBQqEQdnZ26NevH+/vf+3aNfj5+cHS0hL6+vpwdXXFqFGjePup7BygPJ+D8jb8888/mDZtGqysrGBgYIBPP/0UL168kPdPTkiNox4gUdjdu3fRqVMnGBsbY+bMmdDV1cXPP/+Mrl274ty5c/D29gYAPHv2DN26dYNAIEBQUBAMDAywdetWuXpnW7ZswTfffINBgwZh8uTJKCoqwq1bt3DlyhUMGzYMAwYMwIMHD/D7779j1apVsLS0BFCWmCqTl5eHTp06ISYmBqNGjcJ7772H9PR0HDp0CE+fPuW2l1dpaSmePn0KMzMzXvlXX32FsLAwjBw5Et988w0SEhKwbt06REVF4Z9//oGuri4AICgoCMuXL0efPn3g5+eHmzdvws/PD0VFRZUe7+uvv4aVlRXmz5+P/Px8AMCvv/6KgIAA+Pn5YdmyZSgoKMDGjRvRsWNHREVFwcXFBQAwcOBA3L17F5MmTYKLiwvS0tIQERGBxMRE7nWPHj1gZWWF2bNnw9TUFI8fP8a+ffve+DeQ93NQbtKkSTAzM8OCBQvw+PFjrF69GhMnTsQff/yh0N+ekBrDCHlFaGgoA8CuXr1aZZ3+/fszPT099ujRI67s+fPnzMjIiHXu3JkrmzRpEhMIBCwqKoore/nyJTM3N2cAWEJCAlfepUsX1qVLF+51v379WLNmzd4Y64oVKyrsp5yzszMLCAjgXs+fP58BYPv27atQVyaTvfE4zs7OrEePHuzFixfsxYsX7Pbt2+yLL75gAFhgYCBX78KFCwwA27lzJ2/78PBwXnlKSgrT0dFh/fv359VbuHAhA8CLu/z96NixIystLeXKc3NzmampKRs7dixvHykpKczExIQrz8zMZADYihUrqmzf/v373/qeM8YYALZgwQLutbyfg/I2+Pr68v7WU6dOZdra2iwrK+uNxyWkttAQKFGIVCrFiRMn0L9/fzRs2JArt7Ozw7Bhw3Dx4kXk5OQAAMLDw+Hj44PWrVtz9czNzeHv7//W45iamuLp06e4evVqjcS9d+9etGrVCp9++mmFdQKB4K3bnzhxAlZWVrCyskKLFi3w66+/YuTIkVixYgVX588//4SJiQk++ugjpKenc4uXlxcMDQ1x5swZAMCpU6dQWlqKr7/+mneMSZMmVXn8sWPHQltbm3sdERGBrKwsfP7557xjaWtrw9vbmzuWvr4+9PT0cPbs2QrDsOXKJxAdOXIEJSUlb/1bAIp9DsqNGzeO97fu1KkTpFIpnjx5ItcxCalplACJQl68eIGCggJ4eHhUWNe0aVPIZDIkJSUBAJ48eQJ3d/cK9Sore92sWbNgaGiIdu3aoVGjRggMDMQ///xT7bgfPXqE5s2bV3t7b29vREREIDw8HD/88ANMTU2RmZnJm7wTFxeH7OxsWFtbc8myfMnLy0NaWhoAcF/4r/8dzM3NKwyplnN1deW9jouLAwB8+OGHFY514sQJ7lhCoRDLli3DsWPHYGNjg86dO2P58uVISUnh9tWlSxcMHDgQixYtgqWlJfr164fQ0FAUFxdX+fdQ5HNQzsnJife6vK1VJWZCahudAyQqqWnTpoiNjcWRI0cQHh6OvXv3YsOGDZg/fz4WLVpU5/FYWlrC19cXAODn54cmTZrgk08+wU8//YRp06YBKJvYYm1tjZ07d1a6j6rOT8pDX1+f91omkwEoOw9oa2tbof6rlytMmTIFffr0wYEDB3D8+HHMmzcPISEhOH36NNq0aQOBQIC//voLly9fxuHDh3H8+HGMGjUKP/74Iy5fvgxDQ8Nqx/2qV3uwr2KM1cj+CVEUJUCiECsrK4jFYsTGxlZYd//+fWhpacHR0REA4OzsjIcPH1aoV1lZZQwMDDBkyBAMGTIEEokEAwYMQHBwMIKCgiASieQauizn5uaGO3fuyF3/bXr37o0uXbpg6dKl+Oqrr2BgYAA3NzecPHkSHTp0qJCwXuXs7Ayg7O/was/u5cuXcveG3NzcAADW1tZcYn5b/W+//Rbffvst4uLi0Lp1a/z444/47bffuDoffPABPvjgAwQHB2PXrl3w9/fH7t27MWbMmAr7U+RzQIiqoiFQohBtbW306NEDBw8e5E2jT01Nxa5du9CxY0cYGxsDKOspRUZGIjo6mquXkZFRZQ/pVS9fvuS91tPTg6enJxhj3Hmq8mvh5LkTzMCBA3Hz5k3s37+/wrrq9kBmzZqFly9fYsuWLQCAzz77DFKpFN99912FuqWlpVyc3bt3h46ODjZu3Mirs27dOrmP7efnB2NjYyxdurTS83bllxcUFBRUmFnq5uYGIyMjbogzMzOzwt+g/LxtVcOginwOCFFV1AMklfrll18QHh5eoXzy5MlYsmQJIiIi0LFjR3z99dfQ0dHBzz//jOLiYixfvpyrO3PmTPz222/46KOPMGnSJO4yCCcnJ2RkZLyxB9ejRw/Y2tqiQ4cOsLGxQUxMDNatW4fevXvDyMgIAODl5QUAmDNnDoYOHQpdXV306dOn0
ovEZ8yYgb/++guDBw/GqFGj4OXlhYyMDBw6dAibNm1Cq1atFP4b9erVC82bN8fKlSsRGBiILl264KuvvkJISAiio6PRo0cP6OrqIi4uDn/++Sd++uknDBo0CDY2Npg8eTJ+/PFH9O3bFz179sTNmzdx7NgxWFpaytWzNTY2xsaNG/HFF1/gvffew9ChQ2FlZYXExET8/fff6NChA9atW4cHDx6ge/fu+Oyzz+Dp6QkdHR3s378fqampGDp0KABg+/bt2LBhAz799FO4ubkhNzcXW7ZsgbGxMT7++OMqY5D3c0CIylLuJFSiasqnrFe1JCUlMcYYu3HjBvPz82OGhoZMLBazbt26sUuXLlXYX1RUFOvUqRMTCoXMwcGBhYSEsDVr1jAALCUlhav3+mUQP//8M+vcuTOzsLBgQqGQubm5sRkzZrDs7Gze/r/77jvWoEEDpqWlxbsk4vXLIBgruwRj4sSJrEGDBkxPT485ODiwgIAAlp6e/sa/ibOzM+vdu3el68LCwhgAFhoaypVt3ry
"text/plain": [
"<Figure size 1200x1000 with 6 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"num_models = len(best_models_class)\n",
"fig, ax = plt.subplots(num_models, 1, figsize=(12, 10), sharex=False, sharey=False)\n",
"for index, key in enumerate(best_models_class.keys()):\n",
" c_matrix = best_models_class[key][\"Confusion_matrix\"]\n",
" disp = ConfusionMatrixDisplay(\n",
" confusion_matrix=c_matrix, display_labels=[\"0-18\", \"19-30\", \"31-50\", \"51-70\", \"71+\"]\n",
" ).plot(ax=ax.flat[index])\n",
" disp.ax_.set_title(key)\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+8AAAQ9CAYAAAAoK3yPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hT1RvA8W+SNuluKXRTaCllbyrIXoUyRBGQIcgSBBFZAoIKyBZQwAEURIaKssGBIltEKhsRlGmR2cHo3sn9/dEfgdCktNDSwft5nj5wzz05edOkOXlzzj1HpSiKghBCCCGEEEIIIQotdUEHIIQQQgghhBBCiOxJ8i6EEEIIIYQQQhRykrwLIYQQQgghhBCFnCTvQgghhBBCCCFEISfJuxBCCCGEEEIIUchJ8i6EEEIIIYQQQhRykrwLIYQQQgghhBCFnCTvQgghhBBCCCFEISfJuxBCCCGEEEIIUchJ8i5y5NKlS6hUKlauXFnQoYjH0K9fP/z8/Ao6jGJDpVLx/vvvF3QYBa558+Y0b97ceFwY3y8ejFGIwqYw/t2I3JN+NtP777+PSqXKUd0n0ZdKH5Bz0qcXbpK8C1auXIlKpeLIkSMFHUq+uduJ3P2xtrbGz8+P4cOHExMTU9DhCWDRokWoVCrq16//yG1cv36d999/nxMnTuRdYIXc3r17s7y2y5UrR58+ffj3338LOrxcOXDgAO+//778TYpiR/rZmIIO76l197V398fKygofHx/69evHtWvXCjo88QDp08XDWBV0AKJoKFu2LMnJyVhbWxd0KI9l8eLFODg4kJiYyK5du/j00085duwY+/fvL+jQnojPP/8cg8FQ0GGYtXr1avz8/Dh06BAXLlygfPnyuW7j+vXrTJkyBT8/P2rVqpX3QRZiw4cP55lnniE9PZ1jx46xdOlStm7dyl9//YW3t/cTjeVR3y8OHDjAlClT6NevHy4uLvkTnBCFlPSzxUNh7WenTp2Kv78/KSkp/PHHH6xcuZL9+/dz6tQpbGxs8vz+3nvvPcaPH5/n7T4tpE8XlsjIu8gRlUqFjY0NGo2moEOxKCkp6aF1unbtSu/evRk8eDDr1q2je/fu/P777xw6dOgJRHiPwWAgJSXlid4ngLW1NTqd7onf78OEh4dz4MAB5s2bh5ubG6tXry7okIqcJk2a0Lt3b/r378+nn37Khx9+yO3bt1m1apXF2yQmJuZLLEXh/UKIwqYo/N1IP/twhbWfbdeuHb1792bgwIEsW7aMMWPGcPHiRb7//vt8uT8rK6t8+VLgaSF9urBEkneRI+aud+nXrx8ODg5cu3aNTp064eDggJubG2PGjEGv15vc3mAwsGDBAqpWrYqNjQ0eHh4MHjyYO3fumNT77rvv6NChA97e3uh0OgICApg2bVqW9po3b061atU4evQoTZs2xc7OjnfeeSfXj6tJkyYAXLx40aT84MGDtG3bFmdnZ+zs7GjWrBm///57ltvv3buXoKAgbGxsCAgIYMmSJWav81KpVAwbNozVq1dTtWpVdDod27ZtA+DatWsMGDAADw8PdDodVatWZfny5Vnu69NPP6Vq1arY2dlRokQJgoKC+Oabb4zn4+PjGTlyJH5+fuh0Otzd3WndujXHjh0z1jF3LV5iYiJvvfUWvr6+6HQ6KlasyIcffoiiKGYfw5YtW6hWrZox1ruP435nzpzh8uXL5n7lZq1evZoSJUrQoUMHunbtajF5j4mJYdSoUcbHWLp0afr06cPNmzfZu3cvzzzzDAD9+/c3Tjm7+5r18/OjX79+Wdp88LqptLQ0Jk2aRN26dXF2dsbe3p4mTZqwZ8+eHD+euyIjI7GysmLKlClZzp09exaVSsVnn30GQHp6OlOmTCEwMBAbGxtKlixJ48aN2bFjR67vF6Bly5ZA5hcjcG9K699//83LL79MiRIlaNy4sbH+119/Td26dbG1tcXV1ZUePXpw5cqVLO0uXbqUgIAAbG1tqVevHr/99luWOpaujztz5gzdunXDzc0NW1tbKlasyLvvvmuMb+zYsQD4+/sbn79Lly7lS4xCFDbSz0o/m5/97IMsPS9nzpyha9euuLq6YmNjQ1BQUJYEPyf9lbnnKDU1lVGjRuHm5oajoyPPP/88V69ezRKbpXUDzLW5YsUKWrZsibu7OzqdjipVqrB48eIc/Q4e9nw/SPr0lSbl0qcXDJk2Lx6LXq8nJCSE+vXr8+GHH7Jz504++ugjAgICeP311431Bg8ezMqVK+nfvz/Dhw8nPDyczz77jOPHj/P7778bp+KsXLkSBwcHRo8ejYODA7t372bSpEnExcUxd+5ck/u+desW7dq1o0ePHvTu3RsPD49cx3/3TaREiRLGst27d9OuXTvq1q3L5MmTUavVxs7ht99+o169egAcP36ctm3b4uXlxZQpU9Dr9UydOhU3Nzez97V7927WrVvHsGHDKFWqFH5+fkRGRvLss88aO2w3Nzd+/vlnXn31VeLi4hg5ciSQOQ1v+PDhdO3alREjRpCSksLJkyc5ePAgL7/8MgBDhgxhw4YNDBs2jCpVqnDr1i3279/PP//8Q506dczGpCgKzz//PHv27OHVV1+lVq1a/PLLL4wdO5Zr164xf/58k/r79+9n06ZNDB06FEdHRz755BO6dOnC5cuXKVmypLFe5cqVadasGXv37s3R87B69Wo6d+6MVqulZ8+eLF68mMOHDxuTcYCEhASaNGnCP//8w4ABA6hTpw43b97k+++/5+rVq1SuXJmpU6cyadIkXnvtNeMHk4YNG+Yohrvi4uJYtmwZPXv2ZNCgQcTHx/PFF18QEhLCoUOHcjUd38PDg2bNmrFu3TomT55scm7t2rVoNBpeeuklILOjmzVrFgMHDqRevXrExcVx5MgRjh07RuvWrXP1GODeB7L7nxeAl156icDAQGbOnGn84DhjxgwmTpxIt27dGDhwINHR0Xz66ac0bdqU48ePG6e7ffHFFwwePJiGDRsycuRI/v33X55//nlcXV3x9fXNNp6TJ0/SpEkTrK2tee211/Dz8+PixYv88MMPzJgxg86dO3Pu3Dm+/fZb5s+fT6lSpQCMf09PIkYhCiPpZ6WfzYt+9kHmnpfTp0/TqFEjfHx8GD9+PPb29qxbt45OnTqxceNGXnzxReDR+6uBAwfy9ddf8/LLL9OwYUN2795Nhw4dHin+uxYvXkzVqlV5/vnnsbKy4ocffmDo0KEYDAbeeOMNi7fLyfP9IOnT75E+vQAp4qm3YsUKBVAOHz5ssU54eLgCKCtWrDCW9e3bVwGUqVOnmtStXbu2UrduXePxb7/9pgDK6tWrTept27YtS3lSUlKW+x48eLBiZ2enpKSkGMuaNWumAEpoaGiOHuPkyZMVQDl79qwSHR2tXLp0SVm+fLlia2uruLm5KYmJiYqiKIrBYFACAwOVkJAQxWAwmMTl7++vtG7d2ljWsWNHxc7OTrl27Zqx7Pz584qVlZXy4J8WoKjVauX06dMm5a+++qri5eWl3Lx506S8R48ei
rOzs/H38cILLyhVq1bN9jE6Ozsrb7zxRrZ1+vbtq5QtW9Z4vGXLFgVQpk+fblKva9euikqlUi5cuGDyGLRarUnZn3/+qQDKp59+muXxNmvWLNtY7jpy5IgCKDt27FAUJfM5KF26tDJixAiTepMmTVIAZdOmTVnauPtcHT58OMvr9K6yZcsqffv2zVLerFkzk1gzMjKU1NRUkzp37txRPDw8lAEDBpiUA8rkyZOzfXxLlixRAOWvv/4yKa9SpYrSsmVL43HNmjWVDh06ZNuWOXv27FEAZfny5Up0dLRy/fp1ZevWrYqfn5+iUqmMf9d3/wZ69uxpcvtLly4pGo1GmTFjhkn5X3/9pVhZWRnL09LSFHd3d6VWrVomv5+lS5dmeb7NvV80bdpUcXR0VP777z+T+7n/72zu3LkKoISHh+d7jEI8SdLPSj9bUP3s3dfezp07lejoaOXKlSvKhg0bFDc3N0Wn0ylXrlwx1m3VqpVSvXp1k9eBwWBQGjZsqAQGBhrLctJf3X093HXixAkFUIY
"text/plain": [
"<Figure size 1200x1000 with 6 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"_, ax = plt.subplots(3, 2, figsize=(12, 10), sharex=False, sharey=False)\n",
"ax = ax.flatten()\n",
"\n",
"for index, (name, model) in enumerate(best_models_reg.items()):\n",
" y_pred_reg = model.predict(X_test_reg)\n",
"\n",
" # График фактических значений против предсказанных значений\n",
" ax[index * 2].scatter(y_test_reg, y_pred_reg, alpha=0.5)\n",
" ax[index * 2].plot([min(y_test_reg), max(y_test_reg)], [min(y_test_reg), max(y_test_reg)], color='red', linestyle='--')\n",
" ax[index * 2].set_xlabel('Actual Values')\n",
" ax[index * 2].set_ylabel('Predicted Values')\n",
" ax[index * 2].set_title(f'{name}: Actual vs Predicted')\n",
"\n",
" # График остатков\n",
" residuals = y_test_reg - y_pred_reg\n",
" ax[index * 2 + 1].scatter(y_pred_reg, residuals, alpha=0.5)\n",
" ax[index * 2 + 1].axhline(y=0, color='red', linestyle='--')\n",
" ax[index * 2 + 1].set_xlabel('Predicted Values')\n",
" ax[index * 2 + 1].set_ylabel('Residuals')\n",
" ax[index * 2 + 1].set_title(f'{name}: Residuals vs Predicted')\n",
"\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimvenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}