{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Начало лабораторной работы"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 235 entries, 1 to 235\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Country (or dependency) 235 non-null object \n",
" 1 Population2020 235 non-null int64 \n",
" 2 Yearly Change 235 non-null float64\n",
" 3 NetChange 235 non-null int64 \n",
" 4 Density 235 non-null object \n",
" 5 LandArea 235 non-null int64 \n",
" 6 Migrants (net) 201 non-null object \n",
" 7 Fert. Rate 235 non-null object \n",
" 8 Med. Age 235 non-null object \n",
" 9 Urban Pop % 235 non-null object \n",
" 10 World Share 235 non-null object \n",
"dtypes: float64(1), int64(3), object(7)\n",
"memory usage: 22.0+ KB\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Бизнес-цели"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Прогнозирование популяции в странах:\n",
"\n",
"Цель: Разработать модель, которая будет предсказывать популяцию в странах\n",
"Применение:\n",
"Погнозировать глобальную мировую экономику\n",
"\n",
"2. Оптимизация характеристик:\n",
"Цель: Определить оптимальные характеристик по увеличению популяции.\n",
"Применение:\n",
"Понять, какие характеристики соответствуют увеличению популяцию\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Прогнозирование популяции"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Среднее значение поля 'Population2020: 33171202.680851065\n",
" Country (or dependency) Population2020 Yearly Change NetChange Density \\\n",
"no \n",
"1 China 1439323776 0.39 5540090 153 \n",
"2 India 1380004385 0.99 13586631 464 \n",
"3 United States 331002651 0.59 1937734 36 \n",
"4 Indonesia 273523615 1.07 2898047 151 \n",
"5 Pakistan 220892340 2.00 4327022 287 \n",
"\n",
" LandArea Migrants (net) Fert. Rate Med. Age Urban Pop % World Share \\\n",
"no \n",
"1 9388211 -348,399 1.7 38 61% 18.47% \n",
"2 2973190 -532,687 2.2 28 35% 17.70% \n",
"3 9147420 954,806 1.8 38 83% 4.25% \n",
"4 1811570 -98,955 2.3 30 56% 3.51% \n",
"5 770880 -233,379 3.6 23 35% 2.83% \n",
"\n",
" above_average_Population2020 Population2020_volatility \n",
"no \n",
"1 1 1439322975 \n",
"2 1 1439322975 \n",
"3 1 1439322975 \n",
"4 1 1439322975 \n",
"5 1 1439322975 \n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"# Устанавливаем случайное состояние\n",
"random_state = 28\n",
"\n",
"# Рассчитываем среднее значение популяции\n",
"average_count = df['Population2020'].mean()\n",
"print(f\"Среднее значение поля 'Population2020: {average_count}\")\n",
"\n",
"# Создаем новую переменную, указывающую, превышает ли популяция среднее\n",
"df[\"above_average_Population2020\"] = (df[\"Population2020\"] > average_count).astype(int)\n",
"\n",
"# Рассчитываем волатильность (разницу между максимальной и минимальной популяцией)\n",
"df[\"Population2020_volatility\"] = df[\"Population2020\"].max() - df[\"Population2020\"].min()\n",
"\n",
"# Выводим первые строки измененной таблицы для проверки\n",
"print(df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Оптимизация параметров магазина:"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средняя популяция для 'NetChange':\n",
"NetChange\n",
"-383840 1.264765e+08\n",
"-259876 4.373376e+07\n",
"-126866 1.923769e+07\n",
"-88249 6.046183e+07\n",
"-79889 2.843594e+07\n",
" ... \n",
" 2898047 2.735236e+08\n",
" 4327022 2.208923e+08\n",
" 5175990 2.061396e+08\n",
" 5540090 1.439324e+09\n",
" 13586631 1.380004e+09\n",
"Name: Population2020, Length: 234, dtype: float64\n",
"\n",
"Средняя популяция для 'Yearly Change':\n",
"Yearly Change\n",
"-2.47 2860853.0\n",
"-1.69 11239.0\n",
"-1.35 2722289.0\n",
"-1.08 1886198.0\n",
"-0.74 6948445.0\n",
" ... \n",
" 3.27 32866272.0\n",
" 3.32 45741007.0\n",
" 3.47 1402985.0\n",
" 3.68 1701575.0\n",
" 3.84 24206644.0\n",
"Name: Population2020, Length: 174, dtype: float64\n",
"\n",
"Средняя популяция для 'LandArea':\n",
"LandArea\n",
"0 8.010000e+02\n",
"1 3.924200e+04\n",
"10 1.752400e+04\n",
"20 1.082400e+04\n",
"21 9.877000e+03\n",
" ... \n",
"8358140 2.125594e+08\n",
"9093510 3.774215e+07\n",
"9147420 3.310027e+08\n",
"9388211 1.439324e+09\n",
"16376870 1.459345e+08\n",
"Name: Population2020, Length: 226, dtype: float64\n",
"\n",
"Средняя популяция для 'Density':\n",
"Density\n",
"0 30125.0\n",
"1,246 62278.0\n",
"1,261 42876.0\n",
"1,265 164689383.0\n",
"1,380 441543.0\n",
" ... \n",
"93 40222493.0\n",
"94 50263037.0\n",
"95 17109811.5\n",
"96 71986.0\n",
"99 32365999.0\n",
"Name: Population2020, Length: 165, dtype: float64\n",
"\n",
"Средняя популяция для комбинации 'NetChange' и 'LandArea':\n",
"NetChange LandArea\n",
"-383840 364555 1.264765e+08\n",
"-259876 579320 4.373376e+07\n",
"-126866 230170 1.923769e+07\n",
"-88249 294140 6.046183e+07\n",
"-79889 882050 2.843594e+07\n",
" ... \n",
" 2898047 1811570 2.735236e+08\n",
" 4327022 770880 2.208923e+08\n",
" 5175990 910770 2.061396e+08\n",
" 5540090 9388211 1.439324e+09\n",
" 13586631 2973190 1.380004e+09\n",
"Name: Population2020, Length: 235, dtype: float64\n",
"\n",
"Средняя популяция для комбинации 'LandArea' и 'Density':\n",
"LandArea Density\n",
"0 2,003 8.010000e+02\n",
"1 26,337 3.924200e+04\n",
"10 136 1.357000e+03\n",
" 3,369 3.369100e+04\n",
"20 541 1.082400e+04\n",
" ... \n",
"8358140 25 2.125594e+08\n",
"9093510 4 3.774215e+07\n",
"9147420 36 3.310027e+08\n",
"9388211 153 1.439324e+09\n",
"16376870 9 1.459345e+08\n",
"Name: Population2020, Length: 235, dtype: float64\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"# Устанавливаем случайное состояние\n",
"random_state = 42\n",
"\n",
"# Рассчитываем среднюю популяцию для каждого значения каждого признака\n",
"for column in [\"NetChange\", \"Yearly Change\", \"LandArea\", \"Density\"]:\n",
" print(f\"Средняя популяция для '{column}':\")\n",
" print(df.groupby(column)[\"Population2020\"].mean())\n",
" print()\n",
"\n",
"\n",
"print(\"Средняя популяция для комбинации 'NetChange' и 'LandArea':\")\n",
"print(df.groupby([\"NetChange\", \"LandArea\"])[\"Population2020\"].mean())\n",
"print()\n",
"\n",
"\n",
"print(\"Средняя популяция для комбинации 'LandArea' и 'Density':\")\n",
"print(df.groupby([\"LandArea\", \"Density\"])[\"Population2020\"].mean())\n",
"print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Выбор ориентира:\n",
"1. Прогнозирование стоимости акций взносов:\n",
"Ориентир:\n",
"\n",
"R² (коэффициент детерминации): 0.75 - 0.85\n",
"\n",
"MAE (средняя абсолютная ошибка): 1000000 - 1500000 продаж\n",
"\n",
"RMSE (среднеквадратичная ошибка): 1200000 - 1600000 продаж"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MAE: 18325226.004086882\n",
"MSE: 1372712018411057.2\n",
"RMSE: 37050128.453367844\n",
"R²: -0.22943866941880264\n",
"Ориентиры для прогнозирования не достигнуты.\n",
"Средняя популяция 'LandArea':\n",
"LandArea\n",
"0 8.010000e+02\n",
"1 3.924200e+04\n",
"10 1.752400e+04\n",
"20 1.082400e+04\n",
"21 9.877000e+03\n",
" ... \n",
"8358140 2.125594e+08\n",
"9093510 3.774215e+07\n",
"9147420 3.310027e+08\n",
"9388211 1.439324e+09\n",
"16376870 1.459345e+08\n",
"Name: Population2020, Length: 226, dtype: float64\n",
"\n",
"Средняя популяция 'NetChange':\n",
"NetChange\n",
"-383840 1.264765e+08\n",
"-259876 4.373376e+07\n",
"-126866 1.923769e+07\n",
"-88249 6.046183e+07\n",
"-79889 2.843594e+07\n",
" ... \n",
" 2898047 2.735236e+08\n",
" 4327022 2.208923e+08\n",
" 5175990 2.061396e+08\n",
" 5540090 1.439324e+09\n",
" 13586631 1.380004e+09\n",
"Name: Population2020, Length: 234, dtype: float64\n",
"\n",
"Средняя популяция 'Density':\n",
"Density\n",
"0 3.012500e+04\n",
"2 1.937814e+06\n",
"3 9.460677e+06\n",
"4 8.106156e+06\n",
"5 4.649658e+06\n",
" ... \n",
"3369 3.369100e+04\n",
"7140 7.496981e+06\n",
"8358 5.850342e+06\n",
"21645 6.493350e+05\n",
"26337 3.924200e+04\n",
"Name: Population2020, Length: 165, dtype: float64\n",
"\n",
"NetChange LandArea\n",
"-383840 364555 1.264765e+08\n",
"-259876 579320 4.373376e+07\n",
"-126866 230170 1.923769e+07\n",
"-88249 294140 6.046183e+07\n",
"-79889 882050 2.843594e+07\n",
" ... \n",
" 2898047 1811570 2.735236e+08\n",
" 4327022 770880 2.208923e+08\n",
" 5175990 910770 2.061396e+08\n",
" 5540090 9388211 1.439324e+09\n",
" 13586631 2973190 1.380004e+09\n",
"Name: Population2020, Length: 235, dtype: float64\n",
"\n",
"LandArea Density\n",
"0 2003 8.010000e+02\n",
"1 26337 3.924200e+04\n",
"10 136 1.357000e+03\n",
" 3369 3.369100e+04\n",
"20 541 1.082400e+04\n",
" ... \n",
"8358140 25 2.125594e+08\n",
"9093510 4 3.774215e+07\n",
"9147420 36 3.310027e+08\n",
"9388211 153 1.439324e+09\n",
"16376870 9 1.459345e+08\n",
"Name: Population2020, Length: 235, dtype: float64\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Density\"] = df[\"Density\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y)\n",
"\n",
"X = df.drop(\n",
" columns=[\n",
" \"Population2020\",\n",
" \"Country (or dependency)\",\n",
" \"Migrants (net)\",\n",
" \"Fert. Rate\",\n",
" \"Med. Age\",\n",
" \"Urban Pop %\",\n",
" \"World Share\",\n",
" \"Density\",\n",
" ],\n",
" axis=1,\n",
")\n",
"\n",
"y = df[\"Population2020\"]\n",
"\n",
"# Разделяем данные на обучающую и тестовую выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Стандартизируем признаки\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train)\n",
"X_test = scaler.transform(X_test)\n",
"\n",
"# Обучаем модель линейной регрессии\n",
"model = LinearRegression()\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Делаем предсказания на тестовой выборке\n",
"y_pred = model.predict(X_test)\n",
"\n",
"# Оцениваем качество модели\n",
"mae = mean_absolute_error(y_test, y_pred)\n",
"mse = mean_squared_error(y_test, y_pred)\n",
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
"r2 = r2_score(y_test, y_pred)\n",
"\n",
"print(f\"MAE: {mae}\")\n",
"print(f\"MSE: {mse}\")\n",
"print(f\"RMSE: {rmse}\")\n",
"print(f\"R²: {r2}\")\n",
"\n",
"# Проверяем, достигнуты ли ориентиры\n",
"if r2 >= 0.75 and mae <= 1500000 and rmse <= 1700000:\n",
" print(\"Ориентиры для прогнозирования достигнуты!\")\n",
"else:\n",
" print(\"Ориентиры для прогнозирования не достигнуты.\")\n",
"\n",
"\n",
"columns_to_group = [\n",
"\n",
" \"LandArea\",\n",
" \"NetChange\", \"Density\"\n",
"]\n",
"\n",
"# Рассчитываем среднюю популяцию для каждого значения каждого признака\n",
"for column in columns_to_group:\n",
" print(f\"Средняя популяция '{column}':\")\n",
" print(df.groupby(column)[\"Population2020\"].mean())\n",
" print()\n",
"\n",
"# Рассчитываем средняя популяция для комбинаций признаков\n",
"\n",
"\n",
"print(df.groupby([\"NetChange\", \"LandArea\"])[\"Population2020\"].mean())\n",
"print()\n",
"\n",
"\n",
"print(df.groupby([\"LandArea\", \"Density\"])[\"Population2020\"].mean())\n",
"print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Анализ применимости алгоритмов обучения с учителем для решения поставленных задач:\n",
"1. Прогнозирование популяции:\n",
"Задача: Регрессия\n",
"\n",
"Свойства алгоритмов:\n",
"\n",
"Линейная регрессия:\n",
"Применимость: Хорошо подходит для задач, где зависимость между признаками и целевой переменной линейна.\n",
"Преимущества: Проста в реализации, интерпретируема.\n",
"Недостатки: Может плохо работать, если зависимость нелинейна.\n",
"\n",
"Деревья решений (регрессия):\n",
"Применимость: Подходит для задач с нелинейными зависимостями.\n",
"Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных.\n",
"Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n",
"\n",
"Случайный лес (регрессия):\n",
"Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков.\n",
"Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки.\n",
"Недостатки: Менее интерпретируем, чем линейная регрессия.\n",
"\n",
"Градиентный бустинг (регрессия):\n",
"Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками.\n",
"Преимущества: Может достигать высокой точности, устойчив к переобучению.\n",
"Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n",
"\n",
"Нейронные сети (регрессия):\n",
"Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных.\n",
"Преимущества: Может моделировать очень сложные зависимости.\n",
"Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n",
"\n",
"Вывод:\n",
"\n",
"Линейная регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n",
"\n",
"Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n",
"\n",
"Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n",
"\n",
"Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно много.\n",
"\n",
"2. Оптимизация характеристик:\n",
"Задача: Классификация\n",
"\n",
"Свойства алгоритмов:\n",
"\n",
"Логистическая регрессия:\n",
"Применимость: Хорошо подходит для задач бинарной классификации, где зависимость между признаками и целевой переменной линейна.\n",
"Преимущества: Проста в реализации, интерпретируема.\n",
"Недостатки: Может плохо работать, если зависимость нелинейна.\n",
"\n",
"Деревья решений (классификация):\n",
"Применимость: Подходит для задач с нелинейными зависимостями.\n",
"Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных.\n",
"Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n",
"\n",
"Случайный лес (классификация):\n",
"Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков.\n",
"Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки.\n",
"Недостатки: Менее интерпретируем, чем линейная регрессия.\n",
"\n",
"Градиентный бустинг (классификация):\n",
"Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками.\n",
"Преимущества: Может достигать высокой точности, устойчив к переобучению.\n",
"Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n",
"\n",
"Нейронные сети (классификация):\n",
"Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных.\n",
"Преимущества: Может моделировать очень сложные зависимости.\n",
"Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n",
"\n",
"Вывод:\n",
"\n",
"Логистическая регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n",
"\n",
"Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n",
"\n",
"Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n",
"\n",
"Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно много.\n"
]
},
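{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before committing to a single train/test split, the shortlisted models can also be compared with k-fold cross-validation. The sketch in the next cell is an illustrative addition rather than part of the original lab: it assumes `df` has already been loaded and cleaned as in the cells above, reuses the notebook's column names, and the 5-fold setup with negative-MAE and accuracy scoring is an assumption."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import (\n",
" RandomForestRegressor, GradientBoostingRegressor,\n",
" RandomForestClassifier, GradientBoostingClassifier,\n",
")\n",
"\n",
"# Features and targets follow the cells above (df is assumed to be already cleaned)\n",
"features = [\"NetChange\", \"Yearly Change\", \"LandArea\"]\n",
"X_cv = df[features]\n",
"y_reg_cv = df[\"Population2020\"]\n",
"y_class_cv = (df[\"Population2020\"] > df[\"Population2020\"].mean()).astype(int)\n",
"\n",
"# 5-fold CV with negative MAE for the candidate regressors\n",
"for name, model in {\n",
" \"Linear Regression\": LinearRegression(),\n",
" \"Random Forest Regression\": RandomForestRegressor(random_state=42),\n",
" \"Gradient Boosting Regression\": GradientBoostingRegressor(random_state=42),\n",
"}.items():\n",
" scores = cross_val_score(model, X_cv, y_reg_cv, cv=5, scoring=\"neg_mean_absolute_error\")\n",
" print(f\"{name}: MAE = {-scores.mean():,.0f}\")\n",
"\n",
"# 5-fold CV with accuracy for the candidate classifiers\n",
"for name, model in {\n",
" \"Logistic Regression\": LogisticRegression(max_iter=1000),\n",
" \"Random Forest Classification\": RandomForestClassifier(random_state=42),\n",
" \"Gradient Boosting Classification\": GradientBoostingClassifier(random_state=42),\n",
"}.items():\n",
" scores = cross_val_score(model, X_cv, y_class_cv, cv=5, scoring=\"accuracy\")\n",
" print(f\"{name}: accuracy = {scores.mean():.3f}\")"
]
},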
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Прогнозирование стоимости акций:\n",
"Выбранные модели:\n",
"\n",
"Линейная регрессия\n",
"\n",
"Случайный лес (регрессия)\n",
"\n",
"Градиентный бустинг (регрессия)\n",
"\n",
"2. Оптимизация тарифной сетки:\n",
"Выбранные модели:\n",
"\n",
"Логистическая регрессия\n",
"\n",
"Случайный лес (классификация)\n",
"\n",
"Градиентный бустинг (классификация)"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Результаты для задачи регрессии:\n",
"Model: Linear Regression\n",
"MAE: 18325226.004086882\n",
"MSE: 1372712018411057.2\n",
"RMSE: 37050128.453367844\n",
"R²: -0.22943866941880264\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Random Forest Regression\n",
"MAE: 8668932.614893615\n",
"MSE: 406442910254215.25\n",
"RMSE: 20160429.31720987\n",
"R²: 0.63597854169292\n",
"\n",
"Model: Gradient Boosting Regression\n",
"MAE: 8278621.445813501\n",
"MSE: 351479677103981.1\n",
"RMSE: 18747791.259345222\n",
"R²: 0.6852051262385974\n",
"\n",
"Результаты для задачи классификации:\n",
"Model: Logistic Regression\n",
"Accuracy: 0.8936170212765957\n",
"\n",
"Model: Random Forest Classification\n",
"Accuracy: 0.9574468085106383\n",
"\n",
"Model: Gradient Boosting Classification\n",
"Accuracy: 0.8936170212765957\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
"X_reg = df.drop(\n",
" columns=[\n",
" \"Population2020\",\n",
" \"Country (or dependency)\",\n",
" \"Migrants (net)\",\n",
" \"Fert. Rate\",\n",
" \"Med. Age\",\n",
" \"Urban Pop %\",\n",
" \"World Share\",\n",
" \"Density\",\n",
" ],\n",
" axis=1,\n",
")\n",
"y_reg = df[\"Population2020\"]\n",
"\n",
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"# Стандартизируем признаки для задачи регрессии\n",
"scaler_reg = StandardScaler()\n",
"X_train_reg = scaler_reg.fit_transform(X_train_reg)\n",
"X_test_reg = scaler_reg.transform(X_test_reg)\n",
"\n",
"# Список моделей для задачи регрессии\n",
"models_reg = {\n",
" \"Linear Regression\": LinearRegression(),\n",
" \"Random Forest Regression\": RandomForestRegressor(),\n",
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
"}\n",
"\n",
"# Обучаем и оцениваем модели для задачи регрессии\n",
"print(\"Результаты для задачи регрессии:\")\n",
"for name, model in models_reg.items():\n",
" model.fit(X_train_reg, y_train_reg)\n",
" y_pred_reg = model.predict(X_test_reg)\n",
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
" print(f\"Model: {name}\")\n",
" print(f\"MAE: {mae}\")\n",
" print(f\"MSE: {mse}\")\n",
" print(f\"RMSE: {rmse}\")\n",
" print(f\"R²: {r2}\")\n",
" print()\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
"X_class = df.drop(\n",
" columns=[\n",
" \"Population2020\",\n",
" \"Country (or dependency)\",\n",
" \"Migrants (net)\",\n",
" \"Fert. Rate\",\n",
" \"Med. Age\",\n",
" \"Urban Pop %\",\n",
" \"World Share\",\n",
" \"Density\",\n",
" ],\n",
" axis=1,\n",
")\n",
"y_class = (df[\"Population2020\"] > df[\"Population2020\"].mean()).astype(int)\n",
"\n",
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Стандартизируем признаки для задачи классификации\n",
"scaler_class = StandardScaler()\n",
"X_train_class = scaler_class.fit_transform(X_train_class)\n",
"X_test_class = scaler_class.transform(X_test_class)\n",
"\n",
"# Список моделей для задачи классификации\n",
"models_class = {\n",
" \"Logistic Regression\": LogisticRegression(),\n",
" \"Random Forest Classification\": RandomForestClassifier(),\n",
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
"}\n",
"\n",
"# Обучаем и оцениваем модели для задачи классификации\n",
"print(\"Результаты для задачи классификации:\")\n",
"for name, model in models_class.items():\n",
" model.fit(X_train_class, y_train_class)\n",
" y_pred_class = model.predict(X_test_class)\n",
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
" print(f\"Model: {name}\")\n",
" print(f\"Accuracy: {accuracy}\")\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Прогнозирование стоимости акций:\n",
"Конвейер для задачи регрессии:"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Результаты для задачи регрессии:\n",
"Model: Linear Regression\n",
"MAE: 18355084.060291413\n",
"MSE: 1373553690261338.5\n",
"RMSE: 37061485.26788071\n",
"R²: -0.23019249389605534\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Random Forest Regression\n",
"MAE: 8360523.173191491\n",
"MSE: 433344291612551.56\n",
"RMSE: 20816923.20235033\n",
"R²: 0.6118849240519788\n",
"\n",
"Model: Gradient Boosting Regression\n",
"MAE: 7374600.833694444\n",
"MSE: 279881158669974.12\n",
"RMSE: 16729649.089863604\n",
"R²: 0.7493307301928449\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Density\"] = df[\"Density\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"\n",
"numerical_cols = [\"NetChange\", \"Yearly Change\", \"LandArea\", \"Density\"]\n",
"\n",
"\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', StandardScaler(), numerical_cols)\n",
" ])\n",
"\n",
"# Список моделей для задачи регрессии\n",
"models_reg = {\n",
" \"Linear Regression\": LinearRegression(),\n",
" \"Random Forest Regression\": RandomForestRegressor(),\n",
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
"}\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
"X_reg = df[numerical_cols]\n",
"y_reg = df[\"Population2020\"]\n",
"\n",
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"# Обучаем и оцениваем модели для задачи регрессии\n",
"print(\"Результаты для задачи регрессии:\")\n",
"for name, model in models_reg.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" pipeline.fit(X_train_reg, y_train_reg)\n",
" y_pred_reg = pipeline.predict(X_test_reg)\n",
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
" print(f\"Model: {name}\")\n",
" print(f\"MAE: {mae}\")\n",
" print(f\"MSE: {mse}\")\n",
" print(f\"RMSE: {rmse}\")\n",
" print(f\"R²: {r2}\")\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Оптимизация характеристик магазина:\n",
"Конвейер для задачи классификации:"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Результаты для задачи классификации:\n",
"Model: Logistic Regression\n",
"Accuracy: 0.8936170212765957\n",
"\n",
"Model: Random Forest Classification\n",
"Accuracy: 0.9361702127659575\n",
"\n",
"Model: Gradient Boosting Classification\n",
"Accuracy: 0.9148936170212766\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Density\"] = df[\"Density\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"numerical_cols = [\"NetChange\", \"Yearly Change\", \"LandArea\", \"Density\"]\n",
"# Создаем преобразователь для категориальных и числовых столбцов\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', StandardScaler(), numerical_cols)\n",
" ])\n",
"\n",
"# Список моделей для задачи классификации\n",
"models_class = {\n",
" \"Logistic Regression\": LogisticRegression(),\n",
" \"Random Forest Classification\": RandomForestClassifier(),\n",
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
"}\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
"X_class = df[numerical_cols]\n",
"y_class = (df[\"Population2020\"] > df[\"Population2020\"].mean()).astype(int)\n",
"\n",
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Обучаем и оцениваем модели для задачи классификации\n",
"print(\"Результаты для задачи классификации:\")\n",
"for name, model in models_class.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" pipeline.fit(X_train_class, y_train_class)\n",
" y_pred_class = pipeline.predict(X_test_class)\n",
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
" print(f\"Model: {name}\")\n",
" print(f\"Accuracy: {accuracy}\")\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Прогнозирование популяции:\n",
"\n",
"Настройка гиперпараметров для задачи регрессии:"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Результаты для задачи регрессии:\n",
"Model: Linear Regression\n",
"Best Parameters: {}\n",
"MAE: 18355084.060291413\n",
"MSE: 1373553690261338.5\n",
"RMSE: 37061485.26788071\n",
"R²: -0.23019249389605534\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Random Forest Regression\n",
"Best Parameters: {'model__max_depth': None, 'model__n_estimators': 200}\n",
"MAE: 8240284.800957445\n",
"MSE: 400813836952793.8\n",
"RMSE: 20020335.585418988\n",
"R²: 0.6410200946894263\n",
"\n",
"Model: Gradient Boosting Regression\n",
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}\n",
"MAE: 7008860.588160669\n",
"MSE: 275375287564661.78\n",
"RMSE: 16594435.439769013\n",
"R²: 0.7533663123848768\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Density\"] = df[\"Density\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"# Определяем категориальные и числовые столбцы\n",
"\n",
"numerical_cols = [\"NetChange\", \"Yearly Change\", \"LandArea\", \"Density\"]\n",
"\n",
"# Создаем преобразователь для категориальных и числовых столбцов\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', StandardScaler(), numerical_cols)\n",
" ])\n",
"\n",
"# Список моделей и их гиперпараметров для задачи регрессии\n",
"models_reg = {\n",
" \"Linear Regression\": (LinearRegression(), {}),\n",
" \"Random Forest Regression\": (RandomForestRegressor(), {\n",
" 'model__n_estimators': [100, 200],\n",
" 'model__max_depth': [None, 10, 20]\n",
" }),\n",
" \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n",
" 'model__n_estimators': [100, 200],\n",
" 'model__learning_rate': [0.01, 0.1],\n",
" 'model__max_depth': [3, 5]\n",
" })\n",
"}\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
"X_reg = df[numerical_cols]\n",
"y_reg = df['Population2020']\n",
"\n",
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"# Обучаем и оцениваем модели для задачи регрессии\n",
"print(\"Результаты для задачи регрессии:\")\n",
"for name, (model, params) in models_reg.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n",
" grid_search.fit(X_train_reg, y_train_reg)\n",
" best_model = grid_search.best_estimator_\n",
" y_pred_reg = best_model.predict(X_test_reg)\n",
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
" print(f\"Model: {name}\")\n",
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
" print(f\"MAE: {mae}\")\n",
" print(f\"MSE: {mse}\")\n",
" print(f\"RMSE: {rmse}\")\n",
" print(f\"R²: {r2}\")\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Оптимизация характеристик:\n",
"\n",
"Настройка гиперпараметров для задачи классификации:"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Результаты для задачи классификации:\n",
"Model: Logistic Regression\n",
"Best Parameters: {'model__C': 1, 'model__solver': 'liblinear'}\n",
"Accuracy: 0.8936170212765957\n",
"\n",
"Model: Random Forest Classification\n",
"Best Parameters: {'model__max_depth': None, 'model__n_estimators': 200}\n",
"Accuracy: 0.9574468085106383\n",
"\n",
"Model: Gradient Boosting Classification\n",
"Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 5, 'model__n_estimators': 200}\n",
"Accuracy: 0.9148936170212766\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Density\"] = df[\"Density\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"# Определяем категориальные и числовые столбцы\n",
"\n",
"numerical_cols = [\"NetChange\", \"Yearly Change\", \"LandArea\", \"Density\"]\n",
"\n",
"# Создаем преобразователь для категориальных и числовых столбцов\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', StandardScaler(), numerical_cols)\n",
" ])\n",
"\n",
"# Список моделей и их гиперпараметров для задачи классификации\n",
"models_class = {\n",
" \"Logistic Regression\": (LogisticRegression(), {\n",
" 'model__C': [0.1, 1, 10],\n",
" 'model__solver': ['liblinear', 'lbfgs']\n",
" }),\n",
" \"Random Forest Classification\": (RandomForestClassifier(), {\n",
" 'model__n_estimators': [100, 200],\n",
" 'model__max_depth': [None, 10, 20]\n",
" }),\n",
" \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n",
" 'model__n_estimators': [100, 200],\n",
" 'model__learning_rate': [0.01, 0.1],\n",
" 'model__max_depth': [3, 5]\n",
" })\n",
"}\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
"X_class = df[numerical_cols]\n",
"y_class = (df['Population2020'] > df['Population2020'].mean()).astype(int)\n",
"\n",
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Обучаем и оцениваем модели для задачи классификации\n",
"print(\"Результаты для задачи классификации:\")\n",
"for name, (model, params) in models_class.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n",
" grid_search.fit(X_train_class, y_train_class)\n",
" best_model = grid_search.best_estimator_\n",
" y_pred_class = best_model.predict(X_test_class)\n",
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
" print(f\"Model: {name}\")\n",
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
" print(f\"Accuracy: {accuracy}\")\n",
" print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Прогнозирование популяции:\n",
"Задача: Регрессия\n",
"\n",
"Выбор метрик:\n",
"\n",
"MAE (Mean Absolute Error): Средняя абсолютная ошибка. Показывает среднее отклонение предсказанных значений от фактических. Эта метрика легко интерпретируется, так как она измеряется в тех же единицах, что и целевая переменная \n",
"\n",
"MSE (Mean Squared Error): Среднеквадратичная ошибка. Показывает среднее квадратичное отклонение предсказанных значений от фактических. Эта метрика чувствительна к выбросам, так как ошибки возводятся в квадрат.\n",
"\n",
"RMSE (Root Mean Squared Error): Квадратный корень из среднеквадратичной ошибки. Показывает среднее отклонение предсказанных значений от фактических в тех же единицах, что и целевая переменная. Эта метрика также чувствительна к выбросам, но легче интерпретируется, чем MSE.\n",
"\n",
"R² (R-squared): Коэффициент детерминации. Показывает, какую долю дисперсии целевой переменной объясняет модель. Значение R² близкое к 1 указывает на хорошее качество модели.\n",
"\n",
"Обоснование:\n",
"\n",
"MAE: Хорошо подходит для задач, где важно понимать среднее отклонение предсказаний от фактических значений.\n",
"\n",
"MSE и RMSE: Полезны для задач, где важно минимизировать влияние выбросов, так как они возводят ошибки в квадрат.\n",
"\n",
"R²: Позволяет оценить, насколько хорошо модель объясняет вариацию целевой переменной.\n",
"\n",
"2. Оптимизация характеристик:\n",
"Задача: Классификация\n",
"\n",
"Выбор метрик:\n",
"\n",
"Accuracy: Доля правильных предсказаний среди всех предсказаний. Эта метрика показывает общую точность модели.\n",
"\n",
"Precision: Доля правильных положительных предсказаний среди всех положительных предсказаний. Эта метрика важна, если важно минимизировать количество ложноположительных результатов.\n",
"\n",
"Recall (Sensitivity): Доля правильных положительных предсказаний среди всех фактических положительных случаев. Эта метрика важна, если важно минимизировать количество ложноотрицательных результатов.\n",
"\n",
"F1-score: Гармоническое среднее между precision и recall. Эта метрика показывает баланс между precision и recall.\n",
"\n",
"Обоснование:\n",
"\n",
"Accuracy: Хорошо подходит для задач, где классы сбалансированы.\n",
"\n",
"Precision и Recall: Важны для задач, где важно минимизировать ошибки определенного типа (ложноположительные или ложноотрицательные).\n",
"\n",
"F1-score: Позволяет оценить баланс между precision и recall."
]
},
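{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the standard definitions of the metrics discussed above (added here for clarity; $n$ is the number of test observations, $y_i$ the true values, $\\hat{y}_i$ the predictions, $\\bar{y}$ their mean):\n",
"\n",
"$$\\mathrm{MAE} = \\frac{1}{n}\\sum_{i=1}^{n}\\left|y_i - \\hat{y}_i\\right|, \\qquad \\mathrm{MSE} = \\frac{1}{n}\\sum_{i=1}^{n}\\left(y_i - \\hat{y}_i\\right)^2, \\qquad \\mathrm{RMSE} = \\sqrt{\\mathrm{MSE}}$$\n",
"\n",
"$$R^2 = 1 - \\frac{\\sum_{i}\\left(y_i - \\hat{y}_i\\right)^2}{\\sum_{i}\\left(y_i - \\bar{y}\\right)^2}$$\n",
"\n",
"$$\\mathrm{Accuracy} = \\frac{TP + TN}{TP + TN + FP + FN}, \\qquad \\mathrm{Precision} = \\frac{TP}{TP + FP}, \\qquad \\mathrm{Recall} = \\frac{TP}{TP + FN}, \\qquad F_1 = \\frac{2 \\cdot \\mathrm{Precision} \\cdot \\mathrm{Recall}}{\\mathrm{Precision} + \\mathrm{Recall}}$$"
]
},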
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Результаты для задачи регрессии:\n",
"Model: Linear Regression\n",
"Best Parameters: {}\n",
"MAE: 18355084.060291413\n",
"MSE: 1373553690261338.5\n",
"RMSE: 37061485.26788071\n",
"R²: -0.23019249389605534\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Random Forest Regression\n",
"Best Parameters: {'model__max_depth': None, 'model__n_estimators': 100}\n",
"MAE: 8207608.10893617\n",
"MSE: 390570064659169.6\n",
"RMSE: 19762845.560778175\n",
"R²: 0.6501946991290963\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Gradient Boosting Regression\n",
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}\n",
"MAE: 7140734.77304005\n",
"MSE: 290599190343546.25\n",
"RMSE: 17046970.122093435\n",
"R²: 0.7397313659978713\n",
"\n",
"Результаты для задачи классификации:\n",
"Model: Logistic Regression\n",
"Best Parameters: {'model__C': 1, 'model__solver': 'liblinear'}\n",
"Accuracy: 0.8936170212765957\n",
"Precision: 1.0\n",
"Recall: 0.4444444444444444\n",
"F1-score: 0.6153846153846154\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgwAAAHHCAYAAADTQQDlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABOlElEQVR4nO3deVxUVf8H8M+AMiAwICqbAiIkSoJr+hAqqAiimaZlLiW4Wy6puVGZuGK2uOWWC6hJWqTmkpobmuvjhuZGgqikiIoPq7LO+f1hzM8RcBi4wACfd6/7yjn3zjnfOwzDd85yr0wIIUBERET0CnoVHQARERHpPiYMREREpBETBiIiItKICQMRERFpxISBiIiINGLCQERERBoxYSAiIiKNmDAQERGRRkwYiIiISCMmDNXUzZs34evrCzMzM8hkMuzYsUPS+m/fvg2ZTIawsDBJ663MvL294e3tLVl96enpGD58OKytrSGTyTBhwgTJ6tYVkZGRkMlkiIyMlKS+sLAwyGQy3L59W5L6CAgODoZMJqvoMKgcMGGoQLGxsRg1ahQaNWoEQ0NDKBQKeHp6YsmSJXj27FmZth0QEIC//voL8+bNw6ZNm9CmTZsyba88BQYGQiaTQaFQFPo63rx5EzKZDDKZDN98843W9d+/fx/BwcGIioqSINqSmz9/PsLCwvDRRx9h06ZN+PDDD8u0vYYNG+Ktt94q0zakMn/+fMmT4JflJx/5W40aNVC/fn0EBgbi3r17Zdo2UYUQVCF2794tjIyMhLm5uRg/frz44YcfxPfffy/69+8vatasKUaMGFFmbT99+lQAEJ9//nmZtaFUKsWzZ89Ebm5umbVRlICAAFGjRg2hr68vtm7dWmD/zJkzhaGhoQAgvv76a63rP3v2rAAgQkNDtXpeVlaWyMrK0rq9orRr1054enpKVp8mDg4OokePHuXWnhBC5OXliWfPnom8vDytnmdsbCwCAgIKlOfm5opnz54JpVJZ6thCQ0MFADF79myxadMmsWbNGjFs2DChr68vnJycxLNnz0rdRmWQk5NTbc61uqtRselK9RQXF4f+/fvDwcEBhw8fho2NjWrfmDFjEBMTgz179pRZ+48ePQIAmJubl1kbMpkMhoaGZVa/JnK5HJ6envjpp5/Qr18/tX3h4eHo0aMHfv3113KJ5enTp6hVqxYMDAwkrffhw4dwdXWVrL7c3FwolUrJ4ywNPT09Sd9H+vr60NfXl6w+APD391f10A0fPhx169bFV199hZ07dxZ475UlIQQyMzNhZGRUbm0CQI0aNVCjBv+UVAcckqgACxcuRHp6OtatW6eWLORzdnbGJ598onqcm5uLOXPmwMnJCXK5HA0bNsRnn32GrKwstefldxkfP34cbdu2haGhIRo1aoSNGzeqjgkODoaDgwMAYMqUKZDJZGjYsCGA5135+f9+UWFjlAcOHED79u1hbm4OExMTuLi44LPPPlPtL2oOw+HDh9GhQwcYGxvD3NwcvXr1wvXr1wttLyYmBoGBgTA3N4eZmRmGDBmCp0+fFv3CvmTgwIHYu3cvkpOTVWVnz57FzZs3MXDgwALHP3nyBJMnT4abmxtMTEygUCjg7++PS5cuqY6JjIzEG2+8AQAYMmSIqjs6/zy9vb3RrFkznD9/Hh07dkStWrVUr8vLcxgCAgJgaGhY4Pz9/PxQu3Zt3L9/v9Dzyh/Xj4uLw549e1Qx5I/LP3z4EMOGDYOVlRUMDQ3RvHlzbNiwQa2O/J/PN998g8WLF6veW9euXSvWa1uU4r5XlUolgoODYWtri1q1aqFTp064du0aGjZsiMDAwALn+uIchps3b6Jv376wtraGoaEhGjRogP79+yMlJQXA82Q1IyMDGzZsUL02+XUWNYdh79698PLygqmpKRQKBd544w2Eh4eX6DXo0KEDgOdDji+6ceMG3n33XVhYWMDQ0BBt2rTBzp07Czz/8uXL8PLygpGRERo0aIC5c+ciNDS0QNz5v+/79+9HmzZtYGRkhNWrVwMAkpOTMWHCBNjZ2UEul8PZ2RlfffUVlEqlWltbtmxB69atVeft5uaGJUuWqPbn5ORg1qxZeO2112BoaIg6deqgffv2OHDggOqYwj4fpPzMIt3BtLAC7Nq1C40aNcKbb75ZrOOHDx+ODRs24N1338Wnn36KM2fOICQkBNevX8f27dvVjo2JicG7776LYcOGISAgAOvXr0dgYCBat26N119/HX369IG5uTkmTpyIAQMGoHv37jAxMdEq/qtXr+Ktt96Cu7s7Zs+eDblcjpiYGJw4ceKVzzt48CD8/f3RqFEjBAcH49mzZ1i2bBk8PT1x4cKFAslKv3794OjoiJCQEFy4cAFr166FpaUlvvrqq2LF2adPH4wePRrbtm3D0KFDATzvXWjSpAlatWpV4Phbt25hx44deO+99+Do6IjExESsXr0aXl5euHbtGmxtbdG0aVPMnj0bX375JUaOHKn64/DizzIpKQn+/v7o378/PvjgA1hZWRUa35IlS3D48GEEBATg1KlT0NfXx+rVq/HHH39g06ZNsLW1LfR5TZs2xaZNmzBx4kQ0aNAAn376KQCgXr16ePbsGby9vRETE4OxY8fC0dERv/zyCwIDA5GcnKyWiAJAaGgoMjMzMXLkSMjlclhYWBTrtS1Kcd+rQUFBWLhwIXr27Ak/Pz9cunQJfn5+yMzMfGX92dnZ8PPzQ1ZWFsaNGwdra2vcu3cPu3fvRnJyMszMzLBp0yYMHz4cbdu2xciRIwEATk5ORdYZFhaGoUOH4vXXX0dQUBDMzc1x8eJF7Nu3r9DEUpP8P+q1a9dWlV29ehWenp6oX78+pk+fDmNjY/z888/o3bs3fv31V7zzzjsAgHv37qFTp06QyWQICgqCsbEx1q5dC7lcXmhb0dHRGDBgAEaNGoURI0bAxcUFT58+hZeXF+7du4dRo0bB3t4eJ0+eRFBQEBISErB48WIAz5P+AQMGoEuXLqrfqevXr+PEiROq90lwcDBCQkJUr2dqairOnTuHCxcuoGvXrkW+BlJ+ZpEOqegxkeomJSVFABC9evUq1vFRUVECgBg+fLha+eTJkwUAcfjwYVWZg4ODACCOHTumKnv48KGQy+Xi008/VZXFxcUVOn4fEBAgHBwcCsQwc+ZM8eJbZdGiRQKAePToUZFx57fx4jh/ixYthKWlpUhKSlKVXbp0Sejp6YnBgwcXaG/o0KFqdb7zzjuiTp06Rbb54nkYGxsLIYR49913RZcuXYQQz8fDra2txaxZswp9DTIzMwuMlcfFxQm5XC5mz56tKnvVHAYvLy8BQKxatarQfV5eXmpl+/fvFwDE3Llzxa1bt4SJiYno3bu3xnMUovA5BYsXLxYAxI8//qgqy87OFh4eHsLExESkpqaqzguAUCgU4uHDhyVu70XFfa8+ePBA1KhRo8B5BgcHCwBqcw+OHDkiAIgjR44IIYS4ePGiACB++eWXV8Za1ByG/HkHcXFxQgghkpOThampqWjXrl2BcXhN8xzy6zp48
KB49OiRiI+PFxEREaJevXpCLpeL+Ph41bFdunQRbm5uIjMzU63+N998U7z22muqsnHjxgmZTCYuXryoKktKShIWFhZqcQvx/7/v+/btU4trzpw5wtjYWPz9999q5dOnTxf6+vri7t27QgghPvnkE6FQKF45z6h58+Ya5628/PlQFp9ZpBs4JFHOUlNTAQCmpqbFOv73338HAEyaNEmtPP9b5ctzHVxdXVXfeoHn3zpdXFxw69atEsf8svy5D7/99luBLs6iJCQkICoqCoGBgWrfYt3d3dG1a1fVeb5o9OjRao87dOiApKQk1WtYHAMHDkRkZCQePHiAw4cP48GDB0V+a5TL5dDTe/4rkZeXh6SkJNVwy4ULF4rdplwux5AhQ4p1rK+vL0aNGoXZs2ejT58+MDQ0VHUrl8Tvv/8Oa2trDBgwQFVWs2ZNjB8/Hunp6Th69Kja8X379kW9evVK3N7LbQOa36uHDh1Cbm4uPv74Y7Xjxo0bp7ENMzMzAMD+/fu1Gp4qyoEDB5CWlobp06c
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Random Forest Classification\n",
"Best Parameters: {'model__max_depth': None, 'model__n_estimators': 200}\n",
"Accuracy: 1.0\n",
"Precision: 1.0\n",
"Recall: 1.0\n",
"F1-score: 1.0\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgwAAAHHCAYAAADTQQDlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABTGElEQVR4nO3dd1xTV/8H8E9AE2ZAZCsibqm4q7W4RRCto2qtG5y1xYXbp8NVxWqtq1atVRyP1LbOujdYZ124qiiIW1GxbJk5vz98yM8IGAIXCPHz9nVfL3Puzbnfm1ySb84591yZEEKAiIiI6C2MSjoAIiIi0n9MGIiIiEgrJgxERESkFRMGIiIi0ooJAxEREWnFhIGIiIi0YsJAREREWjFhICIiIq2YMBAREZFWTBiKyK1bt+Dt7Q0rKyvIZDJs375d0vrv3LkDmUyGtWvXSlpvada6dWu0bt1asvqSkpIwdOhQODo6QiaTYezYsZLVXVrwPNNv+vD+VK5cGf7+/hpluX3+rV27FjKZDHfu3Cn2GGUyGaZPn17s+zU0Bp0wREVF4bPPPkOVKlVgYmICpVIJT09PLF68GC9fvizSffv5+eHKlSuYPXs2NmzYgMaNGxfp/oqTv78/ZDIZlEplrq/jrVu3IJPJIJPJ8P333+tc/6NHjzB9+nSEh4dLEG3BzZkzB2vXrsXnn3+ODRs2YMCAAUW6v8qVK6tfN5lMBnNzczRp0gTr168v0v2WNm++Tq8vqampJR1eDidPnsT06dMRFxen0/NCQ0PRvXt3ODo6Qi6Xw97eHp07d8bWrVuLJlAJlcTn3549e5gUFLEyJR1AUdm9ezc++eQTKBQKDBw4EHXq1EF6ejqOHz+OiRMn4tq1a/j555+LZN8vX77EqVOn8OWXX2LkyJFFsg9XV1e8fPkSZcuWLZL6tSlTpgxSUlKwc+dO9OrVS2Pdxo0bYWJiUuAP70ePHmHGjBmoXLky6tevn+/nHThwoED7y8uRI0fwwQcfYNq0aZLW+zb169fH+PHjAQCPHz/GL7/8Aj8/P6SlpWHYsGHFFoe+e/11ep1cLi+BaN7u5MmTmDFjBvz9/WFtbZ2v50ybNg0zZ85E9erV8dlnn8HV1RWxsbHYs2cPevTogY0bN6Jv375FG3g+RUREwMjo/3975vX5N2DAAPTu3RsKhaJI4tizZw+WLVuWa9Lw8uVLlCljsF93xcYgX8Ho6Gj07t0brq6uOHLkCJycnNTrAgICEBkZid27dxfZ/p89ewYA+f5wKAiZTAYTE5Miq18bhUIBT09P/PrrrzkShpCQEHTq1AlbtmwpllhSUlJgZmYm+ZfF06dP4e7uLll9mZmZUKlUb42zQoUK6N+/v/qxv78/qlSpgoULFzJheM2br5NUVCoV0tPTS/Rva/PmzZg5cyZ69uyJkJAQjR8FEydOxP79+5GRkVFi8b3pzQQgr88/Y2NjGBsbF1dYGkry/TQowgCNGDFCABAnTpzI1/YZGRli5syZokqVKkIulwtXV1cxdepUkZqaqrGdq6ur6NSpk/jrr7/E+++/LxQKhXBzcxPr1q1TbzNt2jQBQGNxdXUVQgjh5+en/v/rsp/zugMHDghPT09hZWUlzM3NRY0aNcTUqVPV66OjowUAERwcrPG8w4cPi+bNmwszMzNhZWUlunTpIv75559c93fr1i3h5+cnrKyshFKpFP7+/iI5OVnr6+Xn5yfMzc3F2rVrhUKhEP/++6963d9//y0AiC1btggAYv78+ep1sbGxYvz48aJOnTrC3NxcWFpaig4dOojw8HD1NkePHs3x+r1+nK1atRLvvfeeOHfunGjRooUwNTUVY8aMUa9r1aqVuq6BAwcKhUKR4/i9vb2FtbW1ePjwYa7Hl1cM0dHRQgghYmJixODBg4W9vb1QKBSibt26Yu3atRp1ZL8/8+fPFwsXLhRVqlQRRkZG4uLFi3m+rtnn15saN24s5HK5RtmxY8dEz549hYuLi5DL5aJixYpi7NixIiUlRWO77PfqwYMHomvXrsLc3FzY2tqK8ePHi8zMTI1t//33X+Hn5yeUSqWwsrISAwcOFBcvXiz0eRYRESH69esnlEqlsLW1FV999ZVQqVTi3r17okuXLsLS0lI4ODiI77//Ps/XJj+v0+uSkpLEuHHjRMWKFYVcLhc1atQQ8+fPFyqVSmM7ACIgIED897//Fe7u7qJMmTJi27ZtQgghHjx4IAYNGiTs7e2FXC4X7u7uYvXq1Tn2tWTJEuHu7i5MTU2FtbW1aNSokdi4caPGa5DXuZSbWrVqCRsbG5GQkKD1tcjtc+DSpUvCz89PuLm5CYVCIRwcHMSgQYPE8+fPNZ6bkJAgxowZI1xdXYVcLhd2dnbCy8tLnD9/Xr3NzZs3Rffu3YWDg4NQKBSiQoUK4tNPPxVxcXHqbVxdXYWfn1+ex5v9mRccHJzrse/Zs0e0bNlSWFhYCEtLS9G4cWP16ydE/s51Pz+/XF/nbADEtGnTNPZ74cIF0aFDB2FpaSnMzc1F27ZtxalTpzS2yY75+PHjIjAwUNja2gozMzPRrVs38fTpU63vj6ExyBaGnTt3okqVKvjwww/ztf3QoUOxbt069OzZE+PHj8eZM2cQFBSE69evY9u2bRrbRkZGomfPnhgyZAj8/PywZs0a+Pv7o1GjRnjvvffQvXt3WFtbIzAwEH369EHHjh1hYWGhU/zXrl3DRx99hLp162LmzJlQKBSIjIzEiRMn3vq8Q4cOwdfXF1WqVMH06dPx8uVLLF26FJ6enrhw4QIqV66ssX2vXr3g5uaGoKAgXLhwAb/88gvs7e3x3Xff5SvO7t27Y8SIEdi6dSsGDx4M4FXrQq1atdCwYcMc29++fRvbt2/HJ598Ajc3N8TExGDlypVo1aoV/vnnHzg7O6N27dqYOXMmvvnmGwwfPhwtWrQAAI33MjY2Fr6+vujduzf69+8PBweHXONbvHgxjhw5Aj8/P5w6dQrGxsZYuXIlDhw4gA0bNsDZ2TnX59WuXRsbNmxAYGAgKlasqG76trOzw8uXL9G6dWtERkZi5MiRcHNzwx9//AF/f3/ExcVhzJgxGnUFBwcjNTUVw4cPh0KhgI2NTb5e22yZmZl48OABypUrp1H+xx9/ICUlBZ9//jnKly+Pv//+G0uXLsWDBw/wxx9/aGyblZUFHx8fNG3aFN9//z0OHTqEBQsWoGrVqvj8888BAEIIdO3aFcePH8eIESNQu3ZtbNu2DX5+fjli0vU8+/TTT1G7dm3MnTsXu3fvxrfffgsbGxusXLkSbdu2xXfffYeNGzdiwoQJeP/999GyZUutr0tGRgaeP3+uUWZmZgYzMzMIIdClSxccPXoUQ4YMQf369bF//35MnDgRDx8+xMKFCzWed+TIEfz+++8YOXIkbG1tUblyZcTExOCDDz6ATCbDyJEjYWdnh71792LIkCFISEhQD4BdtWoVRo8ejZ49e2LMmDFITU3F5cuXcebMGfTt2xfdu3fHzZs38euvv2LhwoWwtbUF8Opcys2tW
7dw48YNDB48GJaWllpfh9wcPHgQt2/fxqBBg+Do6Kjufr127RpOnz4NmUwGABgxYgQ2b96MkSNHwt3dHbGxsTh+/DiuX7+Ohg0bIj09HT4+PkhLS8OoUaPg6OiIhw8fYteuXYiLi4OVlVWOfev6+bd27VoMHjwY7733HqZOnQpra2tcvHgR+/btU3e55Odc/+yzz/Do0SMcPHgQGzZs0PoaXbt2DS1atIBSqcSkSZNQtmxZrFy5Eq1bt0ZYWBiaNm2qsf2oUaNQrlw5TJs2DXfu3MGiRYswcuRI/Pbbb/l+XwxCSWcsUouPjxcARNeuXfO1fXh4uAAghg4dqlE+YcIEAUAcOXJEXebq6ioAiGPHjqnLnj59KhQKhRg/fry67PVfl6/LbwvDwoULBQDx7NmzPOPO7ZdF/fr1hb29vYiNjVWXXbp0SRgZGYmBAwfm2N/gwYM16vz4449F+fLl89zn68dhbm4uhBCiZ8+eol27dkIIIbKysoSjo6OYMWNGrq9BamqqyMrKynEcCoVCzJw
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Gradient Boosting Classification\n",
"Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 5, 'model__n_estimators': 200}\n",
"Accuracy: 0.9148936170212766\n",
"Precision: 0.8571428571428571\n",
"Recall: 0.6666666666666666\n",
"F1-score: 0.75\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgwAAAHHCAYAAADTQQDlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABTrElEQVR4nO3deVwU9f8H8NeC7oLAgigIKCKgoqh4lhHeIohKKpppqeCZhlqoeXzLvMWvVlpmat6apOX59RYPNM+88EoJEG+8UG459/P7w9ifK+CyMMCCr2ePeeTOzM68Z3eZfe/78/nMyIQQAkRERERvYFDaARAREZH+Y8JAREREWjFhICIiIq2YMBAREZFWTBiIiIhIKyYMREREpBUTBiIiItKKCQMRERFpxYSBiIiItGLCUACRkZHw8vKCubk5ZDIZtm/fLun2b926BZlMhjVr1ki63bKsXbt2aNeunWTbS05OxtChQ2FjYwOZTIYvvvhCsm3ru7w+X9OmTYNMJiu9oMqZsvJ66sO5platWggICNCYl9c5ds2aNZDJZLh161aJxyiTyTBt2rQS36++KzMJQ3R0ND799FM4OTnByMgISqUSHh4e+OGHH/DixYti3be/vz+uXLmC2bNnY/369WjRokWx7q8kBQQEQCaTQalU5vk6RkZGQiaTQSaT4dtvv9V5+w8ePMC0adMQHh4uQbSFN2fOHKxZswYjR47E+vXrMWDAgGLfp0qlwrp169CpUydUrVoVFStWhLW1Nby8vPDLL78gPT292GMoTbq+9zlfEK9O1tbWaN++Pfbu3Vu8wRZAamoqpk2bhrCwsNIOJU9hYWHw8/ODjY0N5HI5rK2t4evri61bt5Z2aFqVxjl2z549TAp0JcqAXbt2CWNjY2FhYSHGjBkjfvnlF/HTTz+Jvn37iooVK4phw4YV275TU1MFAPHVV18V2z5UKpV48eKFyMrKKrZ95Mff319UqFBBGBoaik2bNuVaPnXqVGFkZCQAiPnz5+u8/bNnzwoAYvXq1To9Lz09XaSnp+u8v/y0bNlSeHh4SLY9bVJTU4W3t7cAIN5//30RHBwsVq1aJb799lvh6+srDA0NxeDBg0sklpiYmFzvQWZmpnjx4kWx7lfX93716tUCgJgxY4ZYv369WLdunZg/f75o0KCBACB27txZrPFq8+TJEwFATJ06Ndeykng93+Sbb74RAESdOnXEN998I1auXCnmzZsn2rVrJwCIDRs2CCHy/iyUtLS0NJGRkaF+nN85NisrS7x48UKoVKpiiSMwMFDk9xX44sULkZmZWSz7LcsqlEaSoouYmBj07dsXDg4OOHz4MGxtbdXLAgMDERUVhd27dxfb/p88eQIAsLCwKLZ9yGQyGBkZFdv2tVEoFPDw8MBvv/2GPn36aCwLCQlB165dsWXLlhKJJTU1FZUqVYJcLpd0u48fP4arq6tk28vKyoJKpco3zqCgIOzfvx8LFy7E559/rrFs3LhxiIyMRGhoaJH2URQVKlRAhQr6+efv4+Oj8QtzyJAhqFatGn777Td069atFCPLX2m+nps3b8aMGTPQu3dvhISEoGLFiuplX375Jfbv34/MzMxSiS0vCoVC43F+51hDQ0MYGhqWVFgaSvN8rNdKO2PRZsSIEQKAOHHiRIHWz8zMFDNmzBBOTk5CLpcLBwcHMXnyZJGWlqaxnoODg+jatav4888/xTvvvCMUCoVwdHQUa9euVa8zdepUAUBjcnBwEEK8/GWe8+9X5TznVQcOHBAeHh7C3NxcmJiYiLp164rJkyerl+eX9R86dEi0atVKVKpUSZibm4sPPvhA/P3333nuLzIyUvj7+wtzc3OhVCpFQECASElJ0fp6+fv7CxMTE7FmzRqhUCjE8+fP1cv++usvAUBs2bIlV4UhLi5OjBs3TjRs2FCYmJgIMzMz0blzZxEeHq5e58iRI7lev1ePs23btqJBgwbi3LlzonXr1sLY2Fh8/vnn6mVt27ZVb2vgwIFCoVDkOn4vLy9hYWEh7t+/n+fx5RdDTEyMEEKIR48eicGDBwtra2uhUCiEm5ubWLNmjcY2ct6f+fPniwULFggnJydhYGAgLl68mOc+79y5IwwNDUXnzp3f8MpretM+0tPTxZQpU0SzZs2EUqkUlSpVEq1atRKHDx/OtZ3nz58Lf39/oVQqhbm5uRg4cKC4ePFirs9XXp9TIYRYv369aNasmTAyMhKVK1cWH330kbhz547GOjnv27Vr10S7du2EsbGxsLOzE//973/V62h77/OSU2E4e/asxnyVSiWUSqUYOHCgxvzk5GQxduxYUaNGDSGXy0XdunXF/Pnzc/0iLeg54ezZs8LLy0tUqVJFGBkZiVq1aolBgwZpvD+vTznVhrxeTwAiMDBQbNu2TTRo0EDI5XLh6uoq9u7dm+vYjxw5Ipo3by4UCoVwcnISS5cuzfc9el29evWEpaWlSExM1LpuXueaS5cuCX9/f+Ho6CgUCoWoVq2aGDRokHj69KnGcxMTE8Xnn38uHBwchFwuF1ZWVsLT01OcP39evc4///wj/Pz8RLVq1YRCoRDVq1cXH330kYiPj1ev4+DgIPz9/TVet7zOsTmfh5y/1Rx79uwRbdq0EaampsLMzEy0aNFCXUERQohjx46J3r17C3t7eyGXy0WNGjXEF198IVJTU9Xr+Pv75/l+5nj1vc1x4cIF0blzZ2FmZiZMTExEhw4dxKlTpzTWyYn5+PHjIigoSFStWlVUqlRJ9OjRQzx+/Fjr+6Pv9PMnxit27twJJycnvP/++wVaf+jQoVi7di169+6NcePG4cyZMwgODsb169exbds2jXWjoqLQu3dvDBkyBP7+/li1ahUCAgLQvHlzNGjQAH5+frCwsEBQUBD69euHLl26wNTUVKf4r127hm7dusHNzQ0zZsyAQqFAVFQUTpw48cbnHTx4ED4+PnBycsK0adPw4sULLFq0CB4eHrhw4QJq1aqlsX6fPn3g6OiI4OBgXLhwAStWrIC1tTX++9//FihOPz8/jBgxAlu3bsXgwYMBvKwu1KtXD82aNcu1/s2bN7F9+3Z8+OGHcHR0xKNHj7Bs2TK0bdsWf//9N+zs7FC/fn3MmDED33zzDYYPH47WrVsDgMZ7GRcXBx8fH/Tt2xf9+/dHtWrV8ozvhx9+wOHDh+Hv749Tp07B0NAQy5Ytw4EDB7B+/XrY2dnl+bz69etj/fr1CAoKQo0aNTBu3DgAgJWVFV68eIF27dohKioKo0aNgqOjI/744w8EBAQgPj4+V2Vg9erVSEtLw/Dhw6FQKGBpaZnnPvfu3Yvs7Gz0799fy6ueW177SExMxIoVK9CvXz8MGzYMSUlJWLlyJby9vfHXX3+hSZMmAAAhBLp3747jx49jxIgRqF+/PrZt2wZ/f/8C7Xv27NmYMmUK+vTpg6FDh+LJkydYtGgR2rRpg4sXL2r8Anz+/Dk6d+4MPz8/9OnTB5s3b8bEiRPRqFEj+Pj4FOi9z09CQgKePn0KIQQeP36MRYsWITk5WeP1F
ELggw8+wJEjRzBkyBA0adIE+/fvx5dffon79+9jwYIF6nULck54/PgxvLy8YGVlhUmTJsHCwgK3bt1St/9bWVlhyZIlGDlyJHr27Ak/Pz8AgJub2xuP5fjx49i6dSs+++wzmJmZ4ccff0SvXr1w584dVKlSBQBw8eJFdO7cGba2tpg+fTqys7MxY8YMWFlZaX2tIiMjcePGDQwePBhmZmZa189LaGgobt68iUGDBsHGxgbXrl3DL7/8gmvXruH06dPqzpwjRozA5s2bMWrUKLi6uiIuLg7Hjx/H9evX0axZM2RkZMDb2xvp6ekYPXo0bGxscP/+fezatQvx8fEwNzfPtW9dz7Fr1qzB4MGD0aBBA0yePBkWFha4ePEi9u3bh48//hgA8McffyA1NRUjR45ElSpV8Ndff2HRokW4d+8e/vjjDwDAp59+igcPHiA0NBTr16/X+hpdu3YNrVu3hlKpxIQJE1CxYkUsW7YM7dq1w9GjR9GyZUuN9UePHo3KlStj6tSpuHXrFhYuXIhRo0Z
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABHvUlEQVR4nO3deXhU5d3/8c9MkpksZCEJSQgEQqIiCLIKBKv+VBSXotS20uqjSC3WBbRSW0EFxAWsC8UqSsVS1EcL6mNbn0LhESpWIIgscUEWSQJBIEASkkkC2Wbu3x9JBgcCZnAWMnm/rivXxZw5Z+abo3I+nnPf99dijDECAAAIEdZgFwAAAOBLhBsAABBSCDcAACCkEG4AAEBIIdwAAICQQrgBAAAhhXADAABCSniwCwg0l8ulffv2KTY2VhaLJdjlAACAVjDGqLKyUunp6bJaT31vpt2Fm3379ikjIyPYZQAAgNOwZ88ede3a9ZT7tLtwExsbK6nx5MTFxQW5GgAA0BoOh0MZGRnu6/iptLtw0/woKi4ujnADAEAb05ohJQwoBgAAIYVwAwAAQgrhBgAAhBTCDQAACCmEGwAAEFIINwAAIKQQbgAAQEgh3AAAgJBCuAEAACGFcAMAAEJKUMPNf/7zH40aNUrp6emyWCz6+9///p3HrFq1SgMHDpTdbtdZZ52lhQsX+r1OAADQdgQ13FRXV6tfv36aO3duq/YvLCzUtddeq0svvVR5eXn69a9/rV/+8pdavny5nysFAABtRVAbZ1599dW6+uqrW73/vHnz1KNHDz333HOSpF69emn16tX6wx/+oJEjR/qrTAAA0EoHHTWqrG1QdqcOQauhTXUFz83N1YgRIzy2jRw5Ur/+9a9Pekxtba1qa2vdrx0Oh7/KAwCg3SmtqtW6gjLlFpQoN79U+Yeqdfm5KfrzbRcEraY2FW6Ki4uVmprqsS01NVUOh0NHjx5VVFTUCcfMmjVLM2bMCFSJAACEtIoj9VpXWKrc/FKtKyjVtuJKj/ctFqm6riFI1TVqU+HmdEyZMkWTJk1yv3Y4HMrIyAhiRQAAtB1VtQ36tLBMa/NLlFtQqi37HDLGc5+eqbHKyU5STnaShvZIVEK0LTjFNmlT4SYtLU0HDhzw2HbgwAHFxcW1eNdGkux2u+x2eyDKAwCgzTta59SG3WXKzS/V2vxSfbG3Qk6XZ5rJ6hSjnKwkDc9O1tCsRCV3OLOus20q3OTk5Gjp0qUe2z744APl5OQEqSIAANq2mnqnNheVK7egVLn5JcrbU656p2eY6ZYY3RhmzkrSsKwkpcZFBqna1glquKmqqtLOnTvdrwsLC5WXl6fExER169ZNU6ZM0d69e/X6669Lku688069+OKL+t3vfqdf/OIX+ve//623335bS5YsCdavAABAm1LX4NLn35QrN79UuQWl2rj7sGobXB77pMdHalh2knKyGh81de0YHaRqT09Qw82GDRt06aWXul83j40ZO3asFi5cqP3796uoqMj9fo8ePbRkyRLdf//9ev7559W1a1e9+uqrTAMHAOAkGpwubdnn0NqmMLNhV5mO1Dk99ukUa3cHmZysJHVPipbFYglSxd+fxZjjhwWFNofDofj4eFVUVCguLi7Y5QAA4FMul9HWYkfjnZn8Uq0vLFNlrefspY7REe4gk5OdpOxOHc74MOPN9btNjbkBAACejDH6+mBV0wDgEn1SWKbyI/Ue+8RGhmtojyQNb5rR1DM1VlbrmR1mvg/CDQAAbYgxRoUl1U0DgBvXmimpqvPYJ8YWpgt6JDaGmaxk9U6PU1gIh5njEW4AADjD7Sk74h4AnJtfqmJHjcf7kRFWDe6e6F5rpm+XeEWEBbV9ZFARbgAAOMPsrzjqHjOTW1Cqbw4f9XjfFmbVgG4J7nEz/bslyB4eFqRqzzyEGwAAguxQZa3WFTQumreuoFSFJdUe74dbLeqXkeAeADyoe0dFRhBmToZwAwBAgB2urtMnTf2Z1uaX6uuDVR7vWy1Sny7x7jszF2QmKsbOJbu1OFMAAPiZo6Ze6wvKlNt0d2Zb8Yn9mXp1jmtqaZCkC3okKj4qIjjFhgDCDQAAPlZd26BPdzWGmXVN/ZmOa8+ks1M6KCe7McwM7ZGkjjHBbTYZSgg3AAB8TzX1Tm3afdi9CvBne8rVcFya6ZEco2FNY2aGZSUqJfbM7s/UlhFuAADwUl2DS3l7yrU2v0S5+aXaXFSuOqdnf6YuCVHuRfNyspPUOT4qSNW2P4QbAAC+Q4PTpc/3VrinZ2/YXaaaes8wkxpn1/DsZPeMpozEttVsMpQQbgAAOI7TZfTVPodyC0rc/Zmqj2s2mRRj07CmMTM5WUnqkRxzxvdnai8INwCAds/lMtpxsFJrdzaOmfmkoFSOGs9mk/FRERqWldg4o+msZJ2dcuY3m2yvCDcAgHbHGKP8Q9XKzS9pnNFUUKayas/+TLH2cA3pkdg0ADhJvTvHhXSzyVBCuAEAhDxjjIqa+jM1z2g6VFnrsU9URGOzyeYxM33S4xTejvsztWWEGwBASNpb/q3+TPkl2lfh2WzSFm7VoG4d3TOazu+aIFs4YSYUEG4AACHhoKPG3TV7bX6pisqOeLwfEWZR/4wE5TTNaBrQLYH+TCGKcAMAaJNKq2q1rqDMPaMp/5Bns8kwq0V9m/ozDW9qNhlt47LXHvBPGQDQJlQcqde6pmaT6wpKta240uN9i0U6Lz3OPWbmgsxExUbSn6k9ItwAAM5IVbUN+rSwrHEV4IJSbdl3YrPJc9Nij7U06JGk+GjCDAg3AIAzxNE6pzbsLnOPmflib4Wcx/VnyuoU07RoXrKGZSUqqYM9SNXiTEa4AQAERU29U5uLypsGAZcob0+56p2eYaZbYnTTonmNa82kxtFsEt+NcAMACIi6Bpc+/6a8cWp2Qak27j6s2gbP/kzp8ZFNLQ2SlZOdpC4JNJuE9wg3AAC/aHC6tGWfw71o3oZdZTpyXH+mTrF29wDg4dlJ6pYYTUsDfG+EGwCAT7hcRluLHe6F89YXlqmy1rM/U2KMzd2fKSc7Sdmd6M8E3yPcAABOizFGXx+sahoAXKJPCstUfqTeY5/YyPDG2UxNYaZnaiz9meB3hBsAQKsYY1RYUu1eBXhdQalKqjybTcbYGvszNc9o6p0epzDCDAKMcAMAOKk9Tc0mmwNNscOzP1NkhFWDuzd2zs7JTlLfLvGKoNkkgoxwAwBw21/xrWaTBaX65vBRj/dtYVYN6JbQGGayktS/W4Ls4fRnwpmFcAMA7dihylqtK2hcNG9dQakKSzz7M4VbLeqXkeAeMzOoe0eaTeKMR7gBgHbkcHWdPik81jn764NVHu9bLVLfLvEa1nRn5oLMRMXYuVSgbeHfWAAIYY6aeq0vKFNu092ZbcUn9mfq1TmuaQBwki7okaj4KPozoW0j3ABACKmubdCnuxrDzLqm/kzHtWfS2Skd3IvmDe2RpI4xtuAUC/gJ4QYA2rCaeqc27T7sXgX4sz3lajguzfRIjjnWOTsrUSmx9GdCa
CPcAEAbUtfgUt6ecq3NL1Fufqk2F5WrzunZn6lLQlTjY6amn87x9GdC+0K4AYAzWIPTpc/3VrinZ2/YXaaaes8wkxpnb2w02XR3JiMxOkjVAmcGwg0AnEGcLqOv9jmUW1Di7s9UfVyzyaQYW1Pn7MZBwD2SY+jPBHwL4QYAgsjlMtpxsFJrdzaOmfmkoFSOGs9mk/FRERqWldh4dyY7SWen0GwSOBXCDQAEkDFG+YeqlZtf0jijqaBMZdWe/Zli7eEa0uNYS4NeaXE0mwS8QLgBAD8yxqioqT9T84ymQ5W1HvtERTQ2m8zJanzUdF56nMLpzwScNsINAPjY3vJv9WfKL9G+Cs9mk7ZwqwZ37+geAHx+1wTZwgkzgK8QbgDgezroqHF3zV6bX6qisiMe70eEWdQ/I0E5TTOaBnRLoD8T4EeEGwDwUmlVrdYVlLlnNOUf8mw2GWa1qG+XePdaM4O6d1S0jb9ugUDhvzYA+A4VR+q1rqnZ5LqCUm0rrvR432KRzkuPaxozk6zBmR0VG0l
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn import metrics\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Density\"] = df[\"Density\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"numerical_cols = [\"NetChange\", \"Yearly Change\", \"LandArea\", \"Density\"]\n",
"\n",
"# Создаем преобразователь для категориальных и числовых столбцов\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', StandardScaler(), numerical_cols)\n",
" ])\n",
"\n",
"# Список моделей и их гиперпараметров для задачи регрессии\n",
"models_reg = {\n",
" \"Linear Regression\": (LinearRegression(), {}),\n",
" \"Random Forest Regression\": (RandomForestRegressor(), {\n",
" 'model__n_estimators': [100, 200],\n",
" 'model__max_depth': [None, 10, 20]\n",
" }),\n",
" \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n",
" 'model__n_estimators': [100, 200],\n",
" 'model__learning_rate': [0.01, 0.1],\n",
" 'model__max_depth': [3, 5]\n",
" })\n",
"}\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
"X_reg = df[numerical_cols]\n",
"y_reg = df['Population2020']\n",
"\n",
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"# Обучаем и оцениваем модели для задачи регрессии\n",
"print(\"Результаты для задачи регрессии:\")\n",
"for name, (model, params) in models_reg.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n",
" grid_search.fit(X_train_reg, y_train_reg)\n",
" best_model = grid_search.best_estimator_\n",
" y_pred_reg = best_model.predict(X_test_reg)\n",
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
" print(f\"Model: {name}\")\n",
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
" print(f\"MAE: {mae}\")\n",
" print(f\"MSE: {mse}\")\n",
" print(f\"RMSE: {rmse}\")\n",
" print(f\"R²: {r2}\")\n",
" print()\n",
"\n",
"# Список моделей и их гиперпараметров для задачи классификации\n",
"models_class = {\n",
" \"Logistic Regression\": (LogisticRegression(), {\n",
" 'model__C': [0.1, 1, 10],\n",
" 'model__solver': ['liblinear', 'lbfgs']\n",
" }),\n",
" \"Random Forest Classification\": (RandomForestClassifier(), {\n",
" 'model__n_estimators': [100, 200],\n",
" 'model__max_depth': [None, 10, 20]\n",
" }),\n",
" \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n",
" 'model__n_estimators': [100, 200],\n",
" 'model__learning_rate': [0.01, 0.1],\n",
" 'model__max_depth': [3, 5]\n",
" })\n",
"}\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
"X_class = df[numerical_cols]\n",
"y_class = (df['Population2020'] > df['Population2020'].mean()).astype(int)\n",
"\n",
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
"\n",
"# Обучаем и оцениваем модели для задачи классификации\n",
"print(\"Результаты для задачи классификации:\")\n",
"for name, (model, params) in models_class.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n",
" grid_search.fit(X_train_class, y_train_class)\n",
" best_model = grid_search.best_estimator_\n",
" y_pred_class = best_model.predict(X_test_class)\n",
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
" precision = precision_score(y_test_class, y_pred_class)\n",
" recall = recall_score(y_test_class, y_pred_class)\n",
" f1 = f1_score(y_test_class, y_pred_class)\n",
" print(f\"Model: {name}\")\n",
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
" print(f\"Accuracy: {accuracy}\")\n",
" print(f\"Precision: {precision}\")\n",
" print(f\"Recall: {recall}\")\n",
" print(f\"F1-score: {f1}\")\n",
" print()\n",
"\n",
" # Визуализация матрицы ошибок\n",
" cm = confusion_matrix(y_test_class, y_pred_class)\n",
" disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Less', 'More'])\n",
" disp.plot(cmap=plt.cm.Blues)\n",
" plt.title(f'Confusion Matrix for {name}')\n",
" plt.show()\n",
"\n",
" fpr, tpr, _ = metrics.roc_curve(y_test_class, y_pred_class)\n",
"# построение ROC кривой\n",
"plt.plot(fpr, tpr)\n",
"plt.ylabel(\"True Positive Rate\")\n",
"plt.xlabel(\"False Positive Rate\")\n",
"plt.show()"
]
},
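{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для удобства сравнения можно свести основные метрики регрессии в одну таблицу. Ниже приведён набросок без подбора гиперпараметров; предполагается, что `df`, `numerical_cols` и `preprocessor` уже определены в предыдущей ячейке."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Набросок: сводная таблица метрик регрессии без подбора гиперпараметров\n",
"# Предполагается, что df, numerical_cols и preprocessor определены в предыдущей ячейке\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
"from sklearn.metrics import mean_absolute_error, r2_score\n",
"\n",
"X = df[numerical_cols]\n",
"y = df[\"Population2020\"]\n",
"X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"rows = []\n",
"for model_name, model in {\n",
"    \"Linear Regression\": LinearRegression(),\n",
"    \"Random Forest Regression\": RandomForestRegressor(random_state=42),\n",
"    \"Gradient Boosting Regression\": GradientBoostingRegressor(random_state=42),\n",
"}.items():\n",
"    pipe = Pipeline(steps=[(\"preprocessor\", preprocessor), (\"model\", model)])\n",
"    pipe.fit(X_tr, y_tr)\n",
"    pred = pipe.predict(X_te)\n",
"    rows.append({\n",
"        \"model\": model_name,\n",
"        \"MAE\": mean_absolute_error(y_te, pred),\n",
"        \"R2\": r2_score(y_te, pred),\n",
"    })\n",
"\n",
"# Сводная таблица: одна строка на модель\n",
"summary = pd.DataFrame(rows).set_index(\"model\")\n",
"print(summary)"
]
},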
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Давайте проанализируем полученные значения метрик и определим, являются ли они нормальными или их можно улучшить.\n",
"\n",
"### Оценка смещения и дисперсии для задачи регрессии:\n",
"\n",
"### Вывод для задачи регрессии:\n",
"\n",
"- **Random Forest Regression** демонстрирует наилучшие результаты по метрикам MAE и R², что указывает на высокую точность и стабильность модели.\n",
"- **Linear Regression** и **Gradient Boosting Regression** также показывают хорошие результаты, но уступают случайному лесу.\n",
"\n",
"### Вывод для задачи классификации:\n",
"\n",
"- **Random Forest Classification** демонстрирует наилучшие результаты по всем метрикам (Accuracy, Precision, Recall, F1-score), что указывает на высокую точность и стабильность модели.\n",
"- **Logistic Regression** и **Gradient Boosting Classification** также показывают хорошие результаты, но уступают случайному лесу.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для оценки смещения (bias) и дисперсии (variance) моделей можно использовать метод перекрестной проверки (cross-validation). Этот метод позволяет оценить, насколько хорошо модель обобщается на новых данных.\n",
"\n",
"Оценка смещения и дисперсии для задачи регрессии:\n",
"Для задачи регрессии мы будем использовать метрики MAE (Mean Absolute Error) и R² (R-squared) для оценки смещения и дисперсии.\n",
"\n",
"Оценка смещения и дисперсии для задачи классификации:\n",
"Для задачи классификации мы будем использовать метрики Accuracy, Precision, Recall и F1-score для оценки смещения и дисперсии.\n",
"\n",
"Пример кода для оценки смещения и дисперсии:"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оценка смещения и дисперсии для задачи регрессии:\n",
"Model: Linear Regression\n",
"MAE (Cross-Validation): Mean = 29840383.42279941, Std = 27144603.41689917\n",
"R² (Cross-Validation): Mean = -21089.965883559045, Std = 41156.25923282549\n",
"\n",
"Model: Random Forest Regression\n",
"MAE (Cross-Validation): Mean = 26665786.071446814, Std = 46081535.77323325\n",
"R² (Cross-Validation): Mean = -363.4188861822276, Std = 719.8191512710059\n",
"\n",
"Model: Gradient Boosting Regression\n",
"MAE (Cross-Validation): Mean = 25735773.24094847, Std = 45055407.0307318\n",
"R² (Cross-Validation): Mean = -457.52444446928376, Std = 910.2095956390277\n",
"\n",
"Оценка смещения и дисперсии для задачи классификации:\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: Logistic Regression\n",
"Accuracy (Cross-Validation): Mean = 0.8553191489361703, Std = 0.05613151471605499\n",
"Precision (Cross-Validation): Mean = 0.6947368421052632, Std = 0.40276055725703136\n",
"Recall (Cross-Validation): Mean = 0.40555555555555556, Std = 0.4034381559533162\n",
"F1-score (Cross-Validation): Mean = 0.3924603174603175, Std = 0.31860013632010536\n",
"\n",
"Model: Random Forest Classification\n",
"Accuracy (Cross-Validation): Mean = 0.8340425531914895, Std = 0.21714639697315868\n",
"Precision (Cross-Validation): Mean = 0.8457142857142858, Std = 0.3085714285714286\n",
"Recall (Cross-Validation): Mean = 0.7277777777777777, Std = 0.18604891166267057\n",
"F1-score (Cross-Validation): Mean = 0.7121367521367521, Std = 0.20909441956885236\n",
"\n",
"Model: Gradient Boosting Classification\n",
"Accuracy (Cross-Validation): Mean = 0.8340425531914895, Std = 0.2179786974445225\n",
"Precision (Cross-Validation): Mean = 0.8486486486486486, Std = 0.3027027027027027\n",
"Recall (Cross-Validation): Mean = 0.725, Std = 0.2549509756796392\n",
"F1-score (Cross-Validation): Mean = 0.7126265039308517, Std = 0.21811324450544675\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Density\"] = df[\"Density\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"# Определяем категориальные и числовые столбцы\n",
"\n",
"numerical_cols = [\"NetChange\", \"Yearly Change\", \"LandArea\", \"Density\"]\n",
"\n",
"# Создаем преобразователь для категориальных и числовых столбцов\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', StandardScaler(), numerical_cols)\n",
" ])\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
"X_reg = df[numerical_cols]\n",
"y_reg = df['Population2020']\n",
"\n",
"# Список моделей для задачи регрессии\n",
"models_reg = {\n",
" \"Linear Regression\": LinearRegression(),\n",
" \"Random Forest Regression\": RandomForestRegressor(),\n",
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
"}\n",
"\n",
"# Оценка смещения и дисперсии для задачи регрессии\n",
"print(\"Оценка смещения и дисперсии для задачи регрессии:\")\n",
"for name, model in models_reg.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n",
" r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n",
" print(f\"Model: {name}\")\n",
" print(f\"MAE (Cross-Validation): Mean = {mae_scores.mean()}, Std = {mae_scores.std()}\")\n",
" print(f\"R² (Cross-Validation): Mean = {r2_scores.mean()}, Std = {r2_scores.std()}\")\n",
" print()\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
"X_class = df[numerical_cols]\n",
"y_class = (df['Population2020'] > df['Population2020'].mean()).astype(int)\n",
"\n",
"# Список моделей для задачи классификации\n",
"models_class = {\n",
" \"Logistic Regression\": LogisticRegression(),\n",
" \"Random Forest Classification\": RandomForestClassifier(),\n",
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
"}\n",
"\n",
"# Оценка смещения и дисперсии для задачи классификации\n",
"print(\"Оценка смещения и дисперсии для задачи классификации:\")\n",
"for name, model in models_class.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n",
" precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n",
" recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n",
" f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n",
" print(f\"Model: {name}\")\n",
" print(f\"Accuracy (Cross-Validation): Mean = {accuracy_scores.mean()}, Std = {accuracy_scores.std()}\")\n",
" print(f\"Precision (Cross-Validation): Mean = {precision_scores.mean()}, Std = {precision_scores.std()}\")\n",
" print(f\"Recall (Cross-Validation): Mean = {recall_scores.mean()}, Std = {recall_scores.std()}\")\n",
" print(f\"F1-score (Cross-Validation): Mean = {f1_scores.mean()}, Std = {f1_scores.std()}\")\n",
" print()"
]
},
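{
"cell_type": "markdown",
"metadata": {},
"source": [
"Те же метрики можно получить за один проход по фолдам с помощью `cross_validate` вместо четырёх отдельных вызовов `cross_val_score`. Ниже приведён набросок для одной модели классификации; предполагается, что `X_class`, `y_class` и `preprocessor` определены в предыдущей ячейке."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Набросок: все метрики классификации за один вызов cross_validate\n",
"# Предполагается, что X_class, y_class и preprocessor определены в предыдущей ячейке\n",
"from sklearn.model_selection import cross_validate\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"pipeline = Pipeline(steps=[\n",
"    (\"preprocessor\", preprocessor),\n",
"    (\"model\", RandomForestClassifier(random_state=42))\n",
"])\n",
"\n",
"cv_results = cross_validate(\n",
"    pipeline, X_class, y_class, cv=5,\n",
"    scoring=[\"accuracy\", \"precision\", \"recall\", \"f1\"]\n",
")\n",
"\n",
"for metric in [\"accuracy\", \"precision\", \"recall\", \"f1\"]:\n",
"    values = cv_results[f\"test_{metric}\"]\n",
"    print(f\"{metric}: mean = {values.mean():.3f}, std = {values.std():.3f}\")"
]
},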
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABJsAAAI+CAYAAAAb9gt6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACJJElEQVR4nOzdeZxO5eP/8fes92xmGIYZGUZU1tDIEoVMM5ZIWUILErJmKUuLLeUThUKWFlqQNiplXxOVbEXImmLGFobBzJi5fn/4zfnObWaY0TG3Ma/n4+Fh7nOuc851zn3d51z3+z6LmzHGCAAAAAAAALCBu6srAAAAAAAAgJsHYRMAAAAAAABsQ9gEAAAAAAAA2xA2AQAAAAAAwDaETQAAAAAAALANYRMAAAAAAABsQ9gEAAAAAAAA2xA2AQAAAAAAwDaETQAAAAAAALANYRNcomPHjgoICMjVZR44cEBubm6aOXNmri73ZvbZZ58pODhYZ8+edXVVslSrVi0NHDgw2+XPnj2rp59+WqGhoXJzc1Pfvn2vX+VgcXNz0/Dhw11djVzzX/ZHq1atkpubm1atWmV7vQAgv8rJceiXX36Rt7e3/vrrL9vrMXXqVJUsWVKJiYnZnubjjz9WuXLl5OXlpYIFC9peJ2TUsWNHRUREuLoauap+/fqqX7/+NU0bERGhjh072lof3PgIm1xs5syZcnNzk5ubm9auXZthvDFG4eHhcnNz04MPPuiCGuZcSkqKihcvLjc3Ny1cuNDV1bHFuXPnNHz48Ovy5S7t/c/s3zPPPGP78uySkpKiYcOGqXfv3k7BYUREhNzc3BQVFZXpdO+++661fr/++mumZQYOHCg3Nzc9+uijmY5P+6Ke1b///e9/VtlBgwZp8uTJiouLy9Z6vfbaa5o5c6a6d++ujz/+WE888US2prtWadsr7Z+/v79q1Kihjz766LouF5cMHz5cbm5ucnd3199//51hfHx8vHx9feXm5qZevXq5oIYAkD3p+5Rubm7y9PTULbfcoo4dO+rQoUOurt5N5cUXX1S7du1UqlQpa1j9+vWdtr+vr6/uvPNOTZgwQampqU7TP/HEE6pbt65q1qypevXqaceOHda4jh07KikpSdOmTctWXXbu3KmOHTuqTJkyevfddzV9+nR7VjILacfNtH9eXl6KiIhQnz59dOrUqeu6bDj3gUeNGpVpmccee0xubm65/sM+cDlPV1cAl/j4+Gj27NmqW7eu0/DVq1frn3/+kcPhcFHNcm7FihWKjY1VRESEZs2apcaNG7u6Sv/ZuXPnNGLECEm65kT/Sh544AE9+eSTGYbffvvtti/LLt9++6127dqlrl27Zhjn4+OjlStXKi4uTqGhoU7jZs2aJR8fH124cCHT+RpjNGfOHEVEROjbb7/VmTNnVKBAgUzLtmvXTk2aNMkwvFq1atbfDz30kAIDA/XOO+9o5MiRV12vFStWqFatWho2bNhVy9qlatWqGjBggCQpNjZW7733njp06KDExER16dIl1+rhSufPn5enp+sOSQ6HQ3PmzMlwFtxXX33lohoBwLUZOXKkSpcurQsXLuinn37SzJkztXbtWm3btk0+Pj6url6et2XLFi1btkzr1q3LMK5EiRIaPXq0JOn48eOaPXu2+vXrp2PHjunVV1+1yr388stWH69v377q0aOHVq5cKelSH6pDhw4aN26cevfuLTc3tyvWZ9WqVUpNTdVbb72lsmXL2rWaVzVlyhQFBAQoISFBy5cv18SJE7Vp06ZMfzy/Gb377rsZQsTc5OPjozlz5uill15yGp6QkKCvv/6azzpuCJzZdINo0qSJPv/8c128eNFp+OzZsxUZGZnhC/uN7JNPPtFdd92lfv36af78+UpISHB1lW54t99+ux5//PEM/2rUqHHF6c6dO5fp8IsXLyopKek/1elq79uMGTNUp04d3XLLLRnG1alTRwEBAZo7d67T8H/++Uc//PCDmjZtmuV8V61apX/++UcffPCBLl68eMUv+3fddVem261ixYpWGXd3d7Vq1UofffSRjDFXXCdJOnr0qK2noGfnvbjlllusuj///PNau3atAgICNH78eNvqkV2u+rz6+Pi4NGxq0qSJ5syZk2H47Nmzr9heAeBG07hxYz3++ON6+umn9d577+m5557T3r179c0337i6atdFVn2h62XGjBkqWbKkatWqlWFcUFCQdTzv27ev1qxZo1KlSmnixIlKSUmxyqX/MdEYI3d3569kbdq00V9//WUFUFdy9OhRSbK175KdbdqqVSs9/vjj6tatmz777DM9+uij+vHHH/XLL7/YVo/sSE1NzfIHzOvJy8vLpScDNGnSRH/88Ye2bt3qNPzrr79WUlKSHnjgARfVDPg/hE03iHbt2unEiRNaunSpNSwpKUlffPGF2rdvn+k0qampmjBhgipWrCgfHx8VK1ZM3bp108mTJ53Kff3112ratKmKFy8uh8OhMmXK6JVXXnE66EmXztipVKmS/vjjDzVo0EB+fn665ZZbNGbMmGyvx/nz5zVv3jy1bdtWbdq00fnz5/X1119nWX7fvn2KiYmRv7+/ihcvrpEjR2YIBD799FNFRkaqQIECCgwMVOXKlfXWW29lmE/r1q0VHBwsPz8/1apVS999991V65vVtcfpr8M+cOCAQkJCJEkjRoywTl1Nf13/zp071apVKwUHB8vHx0fVq1e3vVOX9v5s3LhR9913n/z8/PTCCy9Yp9O+8cYbmjBhgsqUKSOHw6E//vhD0qUzde699175+/urYMGCeuihh5xO15b+75ToP/74Q+3bt1ehQoUynGWX3oULF7Ro0aIsL5Xz8fHRI488otmzZzsNnzNnjgoVKqSYmJgs5z1r1ixVqFBBDRo0UFRUlGbNmpXdTZSlBx54QH/99Ze2bNmSZZm0++Ds379f3333nfU+HzhwQNKlzlznzp1VrFgx+fj4qEqVKvrwww+d5nG19yK7QkJCVK5cOe3du9dpeHY/86mpqRo+fLiKFy8uPz8/NWjQQH/88UeG6+XTLrlYvXq1evTooaJFi6pEiRLW+IULF1ptp0CBAmratKm2b9/utKy4uDh16tRJJUqUkMPhUFhYmB566CFru0nSr7/+qpiYGBUpUkS+vr4qXbq0nnrqKaf5ZHavjM2bN6tx48YKDAxUQECAGjZsqJ9++smpTNo6/Pjjj+rfv79CQkLk7++vhx9+WMeOHcvuJlf79u21ZcsW7dy502ndVqxYkeU+ODttQpJOnTqljh07KigoSAULFlSHDh2yvNTgWvclu3fvVsuWLRUaGiofHx+VKFFCbdu21enTp7O3AQDctO69915JynBMyUx29unGGI0aNUolSpSwjjHbt2/PcIxJ61tcLm2/nX6eOe2rXt4XkqTExEQNGzZMZcuWlcPhUHh4uAYOHJjh3keJiYnq16+fQkJCVKBAATVv3lz//PPPVbdNmvnz5+v++++/6hlH0qX+0N13360zZ85YoVB6y5cv13vvved0+b8kRUZGKjg4+Ip9a
OnSpfhpZ2KHhIRkOJa+8847qlixohwOh4oXL66ePXtmOP5caZvmRFbt7Oeff1ajRo0UFBQkPz8/1atXTz/++GOG6VetWqXq1avLx8dHZcqU0bRp0zJtQ2mXtc+aNctat0WLFkmSDh06pKeeekrFihWTw+FQxYoV9cEHH2RY1sSJE1WxYkX5+fmpUKFCql69ulOf9cyZM+rbt68iIiLkcDhUtGhRPfDAA9q0aZNVJrN7NiUkJGjAgAEKDw+Xw+HQHXfcoTfeeCPDd5u0dZg/f74qVapk1TVtPbKjdu3aKl26dIa+9qxZs9SoUSMFBwdnOl122oQkTZ8+XWXKlJGvr69q1KihH374IdP5Zfdzd7nk5GSNGDFCt912m3x8fFS4cGHVrVvX6bsw8j4uo7tBREREqHbt2pozZ4512dnChQt1+vRptW3bVm+//XaGabp166aZM2eqU6dO6tOnj/bv369JkyZp8+bN+vHHH+Xl5SXp0kE9ICBA/fv3V0BAgFasWKGhQ4cqPj5eY8eOdZrnyZM
"text/plain": [
"<Figure size 1200x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABJoAAASlCAYAAADgRbP+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzde3zO9f/H8eeO1zYz520O+5ooEg1zSEgHWRJJJMqxpFjJOq5iKO3bgXRQOjhVRA7JN3JaJOVLYVIhxxQzpzQ2ttnevz/8dn132TV28dmuHR73283Ndb2vz+F1ffbedb33/Jw8jDFGAAAAAAAAwGXydHcBAAAAAAAAKB0ImgAAAAAAAGAJgiYAAAAAAABYgqAJAAAAAAAAliBoAgAAAAAAgCUImgAAAAAAAGAJgiYAAAAAAABYgqAJAAAAAAAAliBoAgAAAAAAgCUImoBSaufOnerYsaMqVKggDw8PLVy40N0l2Q0YMEDh4eFuW//06dPl4eGhffv2ObS/9tpruuKKK+Tl5aUmTZpIksLDwzVgwIAir3H06NHy8PAo8vW60+X0ixtvvFE33nijpfUAAC7fpXy2r169Wh4eHlq9enWh1HQxzsYDxcG+ffvk4eGh6dOnu60GZ+MiZ2PO/MZaRcHDw0OjR48u8vW6y+X0C3f/rqH0ImhCsfbuu+/Kw8NDrVq1cncpJU7//v21detWjRs3Tp988omaN29e6OtMSUnRmDFjFBERocDAQPn7+6tRo0Z65plndPDgwUJf/+VYvny5nn76abVp00bTpk3Tyy+/XOjrTEtL0+jRo4vdl7uHh4c8PDz04IMPOn39+eeft09z9OjRIq4OAHAhOX/g5/zz8/PTVVddpejoaCUnJ7u7vGLPHeMB6dwf/N27d1doaKh8fX0VHBysLl26aMGCBUWy/svhjjHnkiVLil2YlLOT0NPTU3/++Wee11NSUuTv7y8PDw9FR0e7oUKg6Hi7uwDgQmbOnKnw8HBt2LBBu3btUr169dxdUolw+vRprVu3Ts8//3yRfZHt2bNHHTp00P79+9WzZ0899NBD8vX11c8//6wpU6boiy++0O+//14ktVxM3759de+998pms9nbvvnmG3l6emrKlCny9fW1t+/YsUOenoWTyaelpWnMmDGSlOdonBdeeEHPPvtsoay3IPz8/DR//ny9++67DttDkj777DP5+fnpzJkzbqoOAHAxY8eOVZ06dXTmzBmtXbtW7733npYsWaJffvlFAQEBRVbHhx9+qOzsbJfmueGGG3T69Ok83z9FIb/xQGGKi4vT2LFjdeWVV2rIkCGqXbu2jh07piVLlujuu+/WzJkz1adPnyKp5WLOHxflN+Z0Ntay0pIlSzRp0iSnYdPp06fl7e2+P3NtNps+++wzPf300w7tJSE0BKzCEU0otvbu3asffvhBEyZMULVq1TRz5kx3l5Sv1NRUd5fg4MiRI5KkihUrWrbMC73Hs2fPqnv37kpOTtbq1av12WefadiwYRo8eLDefvtt7dmzRz179rSslsvl5eUlPz8/h1PTDh8+LH9//zyDSpvNJh8fn6IuUd7e3vLz8yvy9ea47bbblJKSoq+//tqh/YcfftDevXvVuXNnN1UGACiITp066f7779eDDz6o6dOn6/HHH9fevXv15Zdf5jtPYYxnfHx8XA4bPD095efnV2g7ei4kv/HApTLG6PTp0/m+Pm/ePI0dO1Y9evTQr7/+qjFjxmjQoEF66qmntGrVKi1dulRBQUGW1GKF88dF+Y05nY21ioqfn59bg6bbb79dn332WZ72WbNmMX5CmUHQhGJr5syZqlSpkjp37qwePXrkGzSdOHFCI0aMUHh4uGw2m2rVqqV+/fo5nNJz5swZjR49WldddZX8/PxUvXp1de/eXbt375aU//nJzs55HjBggAIDA7V7927dfvvtKl++vO677z5J0nfffaeePXvqX//6l2w2m8LCwjRixAinA4zt27frnnvuUbVq1eTv76/69evr+eeflyStWrVKHh4e+uKLL/LMN2vWLHl4eGjdunVOt8fo0aNVu3ZtSdJTTz0lDw8Ph2sjbN68WZ06dVJQUJACAwN1yy236L///a/DMnIOu//22281dOhQBQcHq1atWk7XJ0nz58/Xli1b9Pzzz6tt27Z5Xg8KCtK4cePynV+SXn/9dV1//fWqUqWK/P39FRkZqXnz5uWZbsWKFWrbtq0qVqyowMBA1a9fX88995zDNG+//bauueYaBQQEqFKlSmrevLlmzZqV5/3lXDfAw8ND06ZNU2pqqv1Ug5yfubNrEVysz2VkZGjUqFGKjIxUhQoVVK5cObVr106rVq2yL2Pfvn2qVq2aJGnMmDH29ebsmXN2jaazZ8/qxRdfVN26dWWz2RQeHq7nnntO6enpDtOFh4frjjvu0Nq1a9WyZUv5+fnpiiuu0Mcff3zBn0FuNWvW1A033OCw3aRzv5eNGzdWo0aNnM43d+5cRUZGyt/fX1WrVtX999+vAwcO5Jlu4cKFatSokfz8/NSoUSOnfV2SsrOzNXHiRF1zzTXy8/NTSEiIhgwZor///vui7+Fi/QAAypKbb75Z0rkdedKFxzOufPZ+/fXXat++vcqXL6+goCC1aNHC4bPW2TWaZs+ercjISPs8jRs31ptvvml/Pb9xWUG+Y3Le14EDB9StWzcFBgaqWrVqevLJJ5WVlXXBbXSh8YCr38HLli1T8+bN5e/vr/fffz/fdY4cOVKVK1fW1KlTne7YioqK0h133JHv/D///LMGDBigK664Qn5+fgoNDdWgQYN07Ngxh+lOnjypxx9/3D52CQ4O1q233qpNmzbZp9m5c6fuvvtuhYaGys/PT7Vq1dK9996rf/75x+H95YyLLjTmzO8aTRfrLwUZSw8YMECTJk2SJIfTRHM4u0aTK+Pf77//XjExMapWrZrKlSunu+66yx6oFUSfPn2UmJio7du329sOHTqkb775Jt8j0w4fPqwHHnhAISEh8vPzU0REhGbMmJFnuhMnTmjAgAGqUKGCKlasqP79++vEiRNOl7l9+3b16NFDlStXlp+fn5o3b65FixZdtP6C9APgYjh1DsXWzJkz1b17d/n6+qp3795677339OOPP6pFixb2aU6dOqV27dpp27ZtGjRokJo1a6ajR49q0aJF+uuvv1S1alVlZWXpjjvuUEJCgu69914NHz5cJ0+e1IoVK/TLL7+obt26Ltd29uxZRUVFqW3btnr99dfth6DPnTtXaWlpeuSRR1SlShVt2LBBb7/9tv766y/NnTvXPv/PP/+sdu3aycfHRw899JDCw8O1e/du/ec//9G4ceN04403KiwsTDNnztRdd92VZ7vUrVtXrVu3dlpb9+7dVbFiRY0YMUK9e/fW7bffrsDAQEnSr7/+qnbt2ikoKEhPP/20fHx89P777+vGG2/Ut99+m+daWEOHDlW1atU0atSoC+7lzPnS6tu3r8vbMsebb76prl276r777lNGRoZmz56tnj176quvvrLv/fn11191x
x136Nprr9XYsWNls9m0a9cuff/99/blfPjhh3rsscfUo0cPDR8+XGfOnNHPP/+s9evX5/vl/sknn+iDDz7Qhg0b9NFHH0mSrr/+eqfTFqTPpaSk6KOPPlLv3r01ePBgnTx5UlOmTFFUVJQ2bNigJk2aqFq1anrvvff0yCOP6K677lL37t0lSddee22+2+jBBx/UjBkz1KNHDz3xxBNav3694uPjtW3btjxBza5du9SjRw898MAD6t+/v6ZOnaoBAwYoMjJS11xzTYF+Jn369NHw4cN16tQpBQYG6uzZs5o7d65iYmKcnjY3ffp0DRw4UC1atFB8fLySk5P15ptv6vvvv9fmzZvtezuXL1+uu+++Ww0bNlR8fLyOHTumgQMHOg0zhwwZYl/uY489pr179+qdd97R5s2b9f333+d7tNml9AMAKM1ydq5VqVLF3pbfeKagn73Tp0/XoEGDdM011yg2NlYVK1bU5s2btXTp0nw/a1esWKHevXvrlltu0SuvvCJJ2rZtm77//nsNHz483/o
"text/plain": [
"<Figure size 1200x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# Загружаем набор данных\n",
"df = pd.read_csv(\n",
" \".//static//csv///world-population-by-country-2020.csv\", index_col=\"no\"\n",
")\n",
"\n",
"df[\"Population2020\"] = df[\"Population2020\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"NetChange\"] = df[\"NetChange\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Yearly Change\"] = df[\"Yearly Change\"].apply(lambda x: float(\"\".join(x.rstrip(\"%\"))))\n",
"df[\"LandArea\"] = df[\"LandArea\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"df[\"Density\"] = df[\"Density\"].apply(lambda x: int(\"\".join(x.split(\",\"))))\n",
"\n",
"# Определяем категориальные и числовые столбцы\n",
"numerical_cols = [\"NetChange\", \"Yearly Change\", \"LandArea\", \"Density\"]\n",
"\n",
"# Создаем преобразователь для категориальных и числовых столбцов\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', StandardScaler(), numerical_cols)\n",
" ])\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
"X_reg = df[numerical_cols]\n",
"y_reg = df['Population2020']\n",
"\n",
"# Список моделей для задачи регрессии\n",
"models_reg = {\n",
" \"Linear Regression\": LinearRegression(),\n",
" \"Random Forest Regression\": RandomForestRegressor(),\n",
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
"}\n",
"\n",
"# Оценка смещения и дисперсии для задачи регрессии\n",
"mae_means = []\n",
"mae_stds = []\n",
"r2_means = []\n",
"r2_stds = []\n",
"\n",
"for name, model in models_reg.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n",
" r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n",
" mae_means.append(mae_scores.mean())\n",
" mae_stds.append(mae_scores.std())\n",
" r2_means.append(r2_scores.mean())\n",
" r2_stds.append(r2_scores.std())\n",
"\n",
"# Визуализация результатов для задачи регрессии\n",
"fig, ax = plt.subplots(1, 2, figsize=(12, 6))\n",
"\n",
"ax[0].bar(models_reg.keys(), mae_means, yerr=mae_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
"ax[0].set_ylabel('MAE')\n",
"ax[0].set_title('Mean Absolute Error (MAE) for Regression Models')\n",
"ax[0].yaxis.grid(True)\n",
"\n",
"ax[1].bar(models_reg.keys(), r2_means, yerr=r2_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
"ax[1].set_ylabel('R²')\n",
"ax[1].set_title('R-squared (R²) for Regression Models')\n",
"ax[1].yaxis.grid(True)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
"X_class = df[numerical_cols]\n",
"y_class = (df['Population2020'] > df['Population2020'].mean()).astype(int)\n",
"\n",
"# Список моделей для задачи классификации\n",
"models_class = {\n",
" \"Logistic Regression\": LogisticRegression(),\n",
" \"Random Forest Classification\": RandomForestClassifier(),\n",
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
"}\n",
"\n",
"# Оценка смещения и дисперсии для задачи классификации\n",
"accuracy_means = []\n",
"accuracy_stds = []\n",
"precision_means = []\n",
"precision_stds = []\n",
"recall_means = []\n",
"recall_stds = []\n",
"f1_means = []\n",
"f1_stds = []\n",
"\n",
"for name, model in models_class.items():\n",
" pipeline = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('model', model)\n",
" ])\n",
" accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n",
" precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n",
" recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n",
" f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n",
" accuracy_means.append(accuracy_scores.mean())\n",
" accuracy_stds.append(accuracy_scores.std())\n",
" precision_means.append(precision_scores.mean())\n",
" precision_stds.append(precision_scores.std())\n",
" recall_means.append(recall_scores.mean())\n",
" recall_stds.append(recall_scores.std())\n",
" f1_means.append(f1_scores.mean())\n",
" f1_stds.append(f1_scores.std())\n",
"\n",
"# Визуализация результатов для задачи классификации\n",
"fig, ax = plt.subplots(2, 2, figsize=(12, 12))\n",
"\n",
"ax[0, 0].bar(models_class.keys(), accuracy_means, yerr=accuracy_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
"ax[0, 0].set_ylabel('Accuracy')\n",
"ax[0, 0].set_title('Accuracy for Classification Models')\n",
"ax[0, 0].yaxis.grid(True)\n",
"\n",
"ax[0, 1].bar(models_class.keys(), precision_means, yerr=precision_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
"ax[0, 1].set_ylabel('Precision')\n",
"ax[0, 1].set_title('Precision for Classification Models')\n",
"ax[0, 1].yaxis.grid(True)\n",
"\n",
"ax[1, 0].bar(models_class.keys(), recall_means, yerr=recall_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
"ax[1, 0].set_ylabel('Recall')\n",
"ax[1, 0].set_title('Recall for Classification Models')\n",
"ax[1, 0].yaxis.grid(True)\n",
"\n",
"ax[1, 1].bar(models_class.keys(), f1_means, yerr=f1_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
"ax[1, 1].set_ylabel('F1-score')\n",
"ax[1, 1].set_title('F1-score for Classification Models')\n",
"ax[1, 1].yaxis.grid(True)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aisenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}