1774 lines
212 KiB
Plaintext
1774 lines
212 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Начало лабораторной работы"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 67,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Store ID ', 'Store_Area', 'Items_Available', 'Daily_Customer_Count',\n",
|
|||
|
" 'Store_Sales'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Бизнес-цели"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"1. Прогнозирование посетителей в магазине:\n",
|
|||
|
"\n",
|
|||
|
"Цель: Разработать модель, которая будет предсказывать посещение клиентами магазина на основе его характеристик (размер, распродажи, количество ассортимента).\n",
|
|||
|
"\n",
|
|||
|
"Применение:\n",
|
|||
|
"Предсказывание посещения магазинов клиентами.\n",
|
|||
|
"\n",
|
|||
|
"2. Оптимизация параметров магазина:\n",
|
|||
|
"\n",
|
|||
|
"Цель: Определить оптимальные коэффициенты для различных факторов, влияющих на посещаемость магазина чтобы максимизировать прибыль компании при наименьших затратах на пространство магазина и его ассортиментт.\n",
|
|||
|
"\n",
|
|||
|
"Применение:\n",
|
|||
|
"Создавать магазин с максимальной посещаемостью."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"1. Прогнозирование посетителей в магазине"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 68,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Среднее значение поля 'Daily_Customer_Count': 786.3504464285714\n",
|
|||
|
" Store ID Store_Area Items_Available Daily_Customer_Count Store_Sales \\\n",
|
|||
|
"0 1 1659 1961 530 66490 \n",
|
|||
|
"1 2 1461 1752 210 39820 \n",
|
|||
|
"2 3 1340 1609 720 54010 \n",
|
|||
|
"3 4 1451 1748 620 53730 \n",
|
|||
|
"4 5 1770 2111 450 46620 \n",
|
|||
|
"\n",
|
|||
|
" above_average_count customers_volatility \n",
|
|||
|
"0 0 1550 \n",
|
|||
|
"1 0 1550 \n",
|
|||
|
"2 0 1550 \n",
|
|||
|
"3 0 1550 \n",
|
|||
|
"4 0 1550 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Устанавливаем случайное состояние\n",
|
|||
|
"random_state = 42\n",
|
|||
|
"\n",
|
|||
|
"# Рассчитываем среднее значение посещаемости\n",
|
|||
|
"average_count = df['Daily_Customer_Count'].mean()\n",
|
|||
|
"print(f\"Среднее значение поля 'Daily_Customer_Count': {average_count}\")\n",
|
|||
|
"\n",
|
|||
|
"# Создаем новую переменную, указывающую, превышает ли посещаемость среднюю\n",
|
|||
|
"df[\"above_average_count\"] = (df[\"Daily_Customer_Count\"] > average_count).astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Рассчитываем волатильность (разницу между максимальной и минимальной посещаемостью)\n",
|
|||
|
"df[\"customers_volatility\"] = df[\"Daily_Customer_Count\"].max() - df[\"Daily_Customer_Count\"].min()\n",
|
|||
|
"\n",
|
|||
|
"# Выводим первые строки измененной таблицы для проверки\n",
|
|||
|
"print(df.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"2. Оптимизация параметров магазина:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 69,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Средняя посещаемость для 'Store_Area':\n",
|
|||
|
"Store_Area\n",
|
|||
|
"775 1090.0\n",
|
|||
|
"780 790.0\n",
|
|||
|
"854 660.0\n",
|
|||
|
"869 850.0\n",
|
|||
|
"891 630.0\n",
|
|||
|
" ... \n",
|
|||
|
"2063 810.0\n",
|
|||
|
"2067 790.0\n",
|
|||
|
"2169 600.0\n",
|
|||
|
"2214 740.0\n",
|
|||
|
"2229 660.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 583, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Средняя посещаемость для 'Items_Available':\n",
|
|||
|
"Items_Available\n",
|
|||
|
"932 1090.0\n",
|
|||
|
"951 790.0\n",
|
|||
|
"1018 660.0\n",
|
|||
|
"1050 850.0\n",
|
|||
|
"1059 870.0\n",
|
|||
|
" ... \n",
|
|||
|
"2492 790.0\n",
|
|||
|
"2493 810.0\n",
|
|||
|
"2617 600.0\n",
|
|||
|
"2647 740.0\n",
|
|||
|
"2667 660.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 616, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Средняя посещаемость для 'Store_Sales':\n",
|
|||
|
"Store_Sales\n",
|
|||
|
"14920 990.0\n",
|
|||
|
"16370 880.0\n",
|
|||
|
"17670 660.0\n",
|
|||
|
"20270 870.0\n",
|
|||
|
"21300 850.0\n",
|
|||
|
" ... \n",
|
|||
|
"101820 820.0\n",
|
|||
|
"102310 1310.0\n",
|
|||
|
"102920 680.0\n",
|
|||
|
"105150 980.0\n",
|
|||
|
"116320 860.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 816, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Средняя посещаемость для комбинации 'Store_Area' и 'Items_Available':\n",
|
|||
|
"Store_Area Items_Available\n",
|
|||
|
"775 932 1090.0\n",
|
|||
|
"780 951 790.0\n",
|
|||
|
"854 1018 660.0\n",
|
|||
|
"869 1050 850.0\n",
|
|||
|
"891 1073 630.0\n",
|
|||
|
" ... \n",
|
|||
|
"2063 2493 810.0\n",
|
|||
|
"2067 2492 790.0\n",
|
|||
|
"2169 2617 600.0\n",
|
|||
|
"2214 2647 740.0\n",
|
|||
|
"2229 2667 660.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 892, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Средняя посещаемость для комбинации 'Store_Sales' и 'Items_Available':\n",
|
|||
|
"Store_Sales Items_Available\n",
|
|||
|
"14920 1508 990.0\n",
|
|||
|
"16370 1790 880.0\n",
|
|||
|
"17670 1877 660.0\n",
|
|||
|
"20270 1946 870.0\n",
|
|||
|
"21300 1686 850.0\n",
|
|||
|
" ... \n",
|
|||
|
"101820 1758 820.0\n",
|
|||
|
"102310 1587 1310.0\n",
|
|||
|
"102920 1638 680.0\n",
|
|||
|
"105150 2104 980.0\n",
|
|||
|
"116320 2414 860.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 896, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Средняя посещаемость для комбинации 'Store_Sales' и 'Store_Area':\n",
|
|||
|
"Store_Sales Store_Area\n",
|
|||
|
"14920 1250 990.0\n",
|
|||
|
"16370 1477 880.0\n",
|
|||
|
"17670 1537 660.0\n",
|
|||
|
"20270 1624 870.0\n",
|
|||
|
"21300 1397 850.0\n",
|
|||
|
" ... \n",
|
|||
|
"101820 1486 820.0\n",
|
|||
|
"102310 1303 1310.0\n",
|
|||
|
"102920 1365 680.0\n",
|
|||
|
"105150 1775 980.0\n",
|
|||
|
"116320 1989 860.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 896, dtype: float64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Устанавливаем случайное состояние\n",
|
|||
|
"random_state = 42\n",
|
|||
|
"\n",
|
|||
|
"# Рассчитываем среднюю посещаемость для каждого значения каждого признака\n",
|
|||
|
"for column in [\n",
|
|||
|
" \"Store_Area\",\n",
|
|||
|
" \"Items_Available\",\n",
|
|||
|
" \"Store_Sales\"\n",
|
|||
|
"]:\n",
|
|||
|
" print(f\"Средняя посещаемость для '{column}':\")\n",
|
|||
|
" print(df.groupby(column)[\"Daily_Customer_Count\"].mean())\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"print(\"Средняя посещаемость для комбинации 'Store_Area' и 'Items_Available':\")\n",
|
|||
|
"print(df.groupby([\"Store_Area\", \"Items_Available\"])[\"Daily_Customer_Count\"].mean())\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"print(\"Средняя посещаемость для комбинации 'Store_Sales' и 'Items_Available':\")\n",
|
|||
|
"print(df.groupby([\"Store_Sales\", \"Items_Available\"])[\"Daily_Customer_Count\"].mean())\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"print(\"Средняя посещаемость для комбинации 'Store_Sales' и 'Store_Area':\")\n",
|
|||
|
"print(df.groupby([\"Store_Sales\", \"Store_Area\"])[\"Daily_Customer_Count\"].mean())\n",
|
|||
|
"print()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Выбор ориентира:\n",
|
|||
|
"1. Прогнозирование стоимости страховых взносов:\n",
|
|||
|
"Ориентир:\n",
|
|||
|
"\n",
|
|||
|
"R² (коэффициент детерминации): 0.75 - 0.85\n",
|
|||
|
"\n",
|
|||
|
"MAE (средняя абсолютная ошибка): 150 - 300 человек\n",
|
|||
|
"\n",
|
|||
|
"RMSE (среднеквадратичная ошибка): 175 - 315 человек\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 70,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"MAE: 241.24369535006045\n",
|
|||
|
"MSE: 82946.49105226391\n",
|
|||
|
"RMSE: 288.004324711043\n",
|
|||
|
"R²: -0.008816097180501359\n",
|
|||
|
"Ориентиры для прогнозирования не достигнуты.\n",
|
|||
|
"Средняя посещаемость 'Store_Area':\n",
|
|||
|
"Store_Area\n",
|
|||
|
"775 1090.0\n",
|
|||
|
"780 790.0\n",
|
|||
|
"854 660.0\n",
|
|||
|
"869 850.0\n",
|
|||
|
"891 630.0\n",
|
|||
|
" ... \n",
|
|||
|
"2063 810.0\n",
|
|||
|
"2067 790.0\n",
|
|||
|
"2169 600.0\n",
|
|||
|
"2214 740.0\n",
|
|||
|
"2229 660.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 583, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Средняя посещаемость 'Items_Available':\n",
|
|||
|
"Items_Available\n",
|
|||
|
"932 1090.0\n",
|
|||
|
"951 790.0\n",
|
|||
|
"1018 660.0\n",
|
|||
|
"1050 850.0\n",
|
|||
|
"1059 870.0\n",
|
|||
|
" ... \n",
|
|||
|
"2492 790.0\n",
|
|||
|
"2493 810.0\n",
|
|||
|
"2617 600.0\n",
|
|||
|
"2647 740.0\n",
|
|||
|
"2667 660.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 616, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Средняя посещаемость 'Store_Sales':\n",
|
|||
|
"Store_Sales\n",
|
|||
|
"14920 990.0\n",
|
|||
|
"16370 880.0\n",
|
|||
|
"17670 660.0\n",
|
|||
|
"20270 870.0\n",
|
|||
|
"21300 850.0\n",
|
|||
|
" ... \n",
|
|||
|
"101820 820.0\n",
|
|||
|
"102310 1310.0\n",
|
|||
|
"102920 680.0\n",
|
|||
|
"105150 980.0\n",
|
|||
|
"116320 860.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 816, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Средняя стоимость страховых взносов для комбинации 'Store_Area' и 'Items_Available':\n",
|
|||
|
"Store_Area Items_Available\n",
|
|||
|
"775 932 1090.0\n",
|
|||
|
"780 951 790.0\n",
|
|||
|
"854 1018 660.0\n",
|
|||
|
"869 1050 850.0\n",
|
|||
|
"891 1073 630.0\n",
|
|||
|
" ... \n",
|
|||
|
"2063 2493 810.0\n",
|
|||
|
"2067 2492 790.0\n",
|
|||
|
"2169 2617 600.0\n",
|
|||
|
"2214 2647 740.0\n",
|
|||
|
"2229 2667 660.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 892, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Средняя стоимость страховых взносов для комбинации 'Items_Available' и 'Store_Sales':\n",
|
|||
|
"Items_Available Store_Sales\n",
|
|||
|
"932 42530 1090.0\n",
|
|||
|
"951 25600 790.0\n",
|
|||
|
"1018 77740 660.0\n",
|
|||
|
"1050 52540 850.0\n",
|
|||
|
"1059 75110 870.0\n",
|
|||
|
" ... \n",
|
|||
|
"2492 70230 790.0\n",
|
|||
|
"2493 51480 810.0\n",
|
|||
|
"2617 67080 600.0\n",
|
|||
|
"2647 65900 740.0\n",
|
|||
|
"2667 87410 660.0\n",
|
|||
|
"Name: Daily_Customer_Count, Length: 896, dtype: float64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.linear_model import LinearRegression\n",
|
|||
|
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y)\n",
|
|||
|
"X = df.drop(\"Daily_Customer_Count\", axis=1)\n",
|
|||
|
"y = df[\"Daily_Customer_Count\"]\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на обучающую и тестовую выборки\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Стандартизируем признаки\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"X_train = scaler.fit_transform(X_train)\n",
|
|||
|
"X_test = scaler.transform(X_test)\n",
|
|||
|
"\n",
|
|||
|
"# Обучаем модель линейной регрессии\n",
|
|||
|
"model = LinearRegression()\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Делаем предсказания на тестовой выборке\n",
|
|||
|
"y_pred = model.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"# Оцениваем качество модели\n",
|
|||
|
"mae = mean_absolute_error(y_test, y_pred)\n",
|
|||
|
"mse = mean_squared_error(y_test, y_pred)\n",
|
|||
|
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
|
|||
|
"r2 = r2_score(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"MAE: {mae}\")\n",
|
|||
|
"print(f\"MSE: {mse}\")\n",
|
|||
|
"print(f\"RMSE: {rmse}\")\n",
|
|||
|
"print(f\"R²: {r2}\")\n",
|
|||
|
"\n",
|
|||
|
"# Проверяем, достигнуты ли ориентиры\n",
|
|||
|
"if r2 >= 0.75 and mae <= 300 and rmse <= 350:\n",
|
|||
|
" print(\"Ориентиры для прогнозирования достигнуты!\")\n",
|
|||
|
"else:\n",
|
|||
|
" print(\"Ориентиры для прогнозирования не достигнуты.\")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"columns_to_group = [\n",
|
|||
|
" \"Store_Area\",\n",
|
|||
|
" \"Items_Available\",\n",
|
|||
|
" \"Store_Sales\"\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Рассчитываем среднюю посещаемость для каждого значения каждого признака\n",
|
|||
|
"for column in columns_to_group:\n",
|
|||
|
" print(f\"Средняя посещаемость '{column}':\")\n",
|
|||
|
" print(df.groupby(column)[\"Daily_Customer_Count\"].mean())\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Рассчитываем среднюю посещаемость для комбинаций признаков\n",
|
|||
|
"\n",
|
|||
|
"print(\n",
|
|||
|
" \"Средняя стоимость страховых взносов для комбинации 'Store_Area' и 'Items_Available':\"\n",
|
|||
|
")\n",
|
|||
|
"print(df.groupby([\"Store_Area\", \"Items_Available\"])[\"Daily_Customer_Count\"].mean())\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"print(\n",
|
|||
|
" \"Средняя стоимость страховых взносов для комбинации 'Items_Available' и 'Store_Sales':\"\n",
|
|||
|
")\n",
|
|||
|
"print(df.groupby([\"Items_Available\", \"Store_Sales\"])[\"Daily_Customer_Count\"].mean())\n",
|
|||
|
"print()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Анализ применимости алгоритмов обучения с учителем для решения поставленных задач:\n",
|
|||
|
"1. Прогнозирование посещаемости магазинов:\n",
|
|||
|
"Задача: Регрессия\n",
|
|||
|
"\n",
|
|||
|
"Свойства алгоритмов:\n",
|
|||
|
"\n",
|
|||
|
"Линейная регрессия:\n",
|
|||
|
"Применимость: Хорошо подходит для задач, где зависимость между признаками и целевой переменной линейна.\n",
|
|||
|
"Преимущества: Проста в реализации, интерпретируема.\n",
|
|||
|
"Недостатки: Может плохо работать, если зависимость нелинейна.\n",
|
|||
|
"\n",
|
|||
|
"Деревья решений (регрессия):\n",
|
|||
|
"Применимость: Подходит для задач с нелинейными зависимостями.\n",
|
|||
|
"Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных.\n",
|
|||
|
"Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n",
|
|||
|
"\n",
|
|||
|
"Случайный лес (регрессия):\n",
|
|||
|
"Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков.\n",
|
|||
|
"Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки.\n",
|
|||
|
"Недостатки: Менее интерпретируем, чем линейная регрессия.\n",
|
|||
|
"\n",
|
|||
|
"Градиентный бустинг (регрессия):\n",
|
|||
|
"Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками.\n",
|
|||
|
"Преимущества: Может достигать высокой точности, устойчив к переобучению.\n",
|
|||
|
"Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n",
|
|||
|
"\n",
|
|||
|
"Нейронные сети (регрессия):\n",
|
|||
|
"Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных.\n",
|
|||
|
"Преимущества: Может моделировать очень сложные зависимости.\n",
|
|||
|
"Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n",
|
|||
|
"\n",
|
|||
|
"Вывод:\n",
|
|||
|
"\n",
|
|||
|
"Линейная регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n",
|
|||
|
"\n",
|
|||
|
"Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n",
|
|||
|
"\n",
|
|||
|
"Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n",
|
|||
|
"\n",
|
|||
|
"Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно много.\n",
|
|||
|
"\n",
|
|||
|
"2. Оптимизация тарифной сетки:\n",
|
|||
|
"Задача: Классификация (группировка клиентов по группам риска)\n",
|
|||
|
"\n",
|
|||
|
"Свойства алгоритмов:\n",
|
|||
|
"\n",
|
|||
|
"Логистическая регрессия:\n",
|
|||
|
"Применимость: Хорошо подходит для задач бинарной классификации, где зависимость между признаками и целевой переменной линейна.\n",
|
|||
|
"Преимущества: Проста в реализации, интерпретируема.\n",
|
|||
|
"Недостатки: Может плохо работать, если зависимость нелинейна.\n",
|
|||
|
"\n",
|
|||
|
"Деревья решений (классификация):\n",
|
|||
|
"Применимость: Подходит для задач с нелинейными зависимостями.\n",
|
|||
|
"Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных.\n",
|
|||
|
"Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n",
|
|||
|
"\n",
|
|||
|
"Случайный лес (классификация):\n",
|
|||
|
"Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков.\n",
|
|||
|
"Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки.\n",
|
|||
|
"Недостатки: Менее интерпретируем, чем линейная регрессия.\n",
|
|||
|
"\n",
|
|||
|
"Градиентный бустинг (классификация):\n",
|
|||
|
"Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками.\n",
|
|||
|
"Преимущества: Может достигать высокой точности, устойчив к переобучению.\n",
|
|||
|
"Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n",
|
|||
|
"\n",
|
|||
|
"Нейронные сети (классификация):\n",
|
|||
|
"Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных.\n",
|
|||
|
"Преимущества: Может моделировать очень сложные зависимости.\n",
|
|||
|
"Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n",
|
|||
|
"\n",
|
|||
|
"Вывод:\n",
|
|||
|
"\n",
|
|||
|
"Логистическая регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n",
|
|||
|
"\n",
|
|||
|
"Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n",
|
|||
|
"\n",
|
|||
|
"Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n",
|
|||
|
"\n",
|
|||
|
"Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно много.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"1. Прогнозирование стоимости страховых взносов:\n",
|
|||
|
"Выбранные модели:\n",
|
|||
|
"\n",
|
|||
|
"Линейная регрессия\n",
|
|||
|
"\n",
|
|||
|
"Случайный лес (регрессия)\n",
|
|||
|
"\n",
|
|||
|
"Градиентный бустинг (регрессия)\n",
|
|||
|
"\n",
|
|||
|
"2. Оптимизация тарифной сетки:\n",
|
|||
|
"Выбранные модели:\n",
|
|||
|
"\n",
|
|||
|
"Логистическая регрессия\n",
|
|||
|
"\n",
|
|||
|
"Случайный лес (классификация)\n",
|
|||
|
"\n",
|
|||
|
"Градиентный бустинг (классификация)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 71,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Результаты для задачи регрессии:\n",
|
|||
|
"Model: Linear Regression\n",
|
|||
|
"MAE: 241.24369535006045\n",
|
|||
|
"MSE: 82946.49105226391\n",
|
|||
|
"RMSE: 288.004324711043\n",
|
|||
|
"R²: -0.008816097180501359\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: Random Forest Regression\n",
|
|||
|
"MAE: 240.68666666666667\n",
|
|||
|
"MSE: 85748.043\n",
|
|||
|
"RMSE: 292.82766775016324\n",
|
|||
|
"R²: -0.042889276963148815\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: Gradient Boosting Regression\n",
|
|||
|
"MAE: 243.53822748120598\n",
|
|||
|
"MSE: 86937.70201509264\n",
|
|||
|
"RMSE: 294.85200018838714\n",
|
|||
|
"R²: -0.05735820927548918\n",
|
|||
|
"\n",
|
|||
|
"Результаты для задачи классификации:\n",
|
|||
|
"Model: Logistic Regression\n",
|
|||
|
"Accuracy: 0.43333333333333335\n",
|
|||
|
"\n",
|
|||
|
"Model: Random Forest Classification\n",
|
|||
|
"Accuracy: 0.4777777777777778\n",
|
|||
|
"\n",
|
|||
|
"Model: Gradient Boosting Classification\n",
|
|||
|
"Accuracy: 0.4888888888888889\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
|||
|
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
|||
|
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
|||
|
"X_reg = df.drop(\"Daily_Customer_Count\", axis=1)\n",
|
|||
|
"y_reg = df[\"Daily_Customer_Count\"]\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
|||
|
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Стандартизируем признаки для задачи регрессии\n",
|
|||
|
"scaler_reg = StandardScaler()\n",
|
|||
|
"X_train_reg = scaler_reg.fit_transform(X_train_reg)\n",
|
|||
|
"X_test_reg = scaler_reg.transform(X_test_reg)\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей для задачи регрессии\n",
|
|||
|
"models_reg = {\n",
|
|||
|
" \"Linear Regression\": LinearRegression(),\n",
|
|||
|
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
|||
|
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
|||
|
"print(\"Результаты для задачи регрессии:\")\n",
|
|||
|
"for name, model in models_reg.items():\n",
|
|||
|
" model.fit(X_train_reg, y_train_reg)\n",
|
|||
|
" y_pred_reg = model.predict(X_test_reg)\n",
|
|||
|
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
|||
|
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
|||
|
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
|||
|
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"MAE: {mae}\")\n",
|
|||
|
" print(f\"MSE: {mse}\")\n",
|
|||
|
" print(f\"RMSE: {rmse}\")\n",
|
|||
|
" print(f\"R²: {r2}\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
|||
|
"X_class = df.drop(\"Daily_Customer_Count\", axis=1)\n",
|
|||
|
"y_class = (df[\"Daily_Customer_Count\"] > df[\"Daily_Customer_Count\"].mean()).astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
|||
|
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Стандартизируем признаки для задачи классификации\n",
|
|||
|
"scaler_class = StandardScaler()\n",
|
|||
|
"X_train_class = scaler_class.fit_transform(X_train_class)\n",
|
|||
|
"X_test_class = scaler_class.transform(X_test_class)\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей для задачи классификации\n",
|
|||
|
"models_class = {\n",
|
|||
|
" \"Logistic Regression\": LogisticRegression(),\n",
|
|||
|
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
|||
|
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Обучаем и оцениваем модели для задачи классификации\n",
|
|||
|
"print(\"Результаты для задачи классификации:\")\n",
|
|||
|
"for name, model in models_class.items():\n",
|
|||
|
" model.fit(X_train_class, y_train_class)\n",
|
|||
|
" y_pred_class = model.predict(X_test_class)\n",
|
|||
|
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"Accuracy: {accuracy}\")\n",
|
|||
|
" print()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"1. Прогнозирование стоимости страховых взносов:\n",
|
|||
|
"Конвейер для задачи регрессии:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 72,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Результаты для задачи регрессии:\n",
|
|||
|
"Model: Linear Regression\n",
|
|||
|
"MAE: 240.99246411452697\n",
|
|||
|
"MSE: 82771.10925011222\n",
|
|||
|
"RMSE: 287.6996858707222\n",
|
|||
|
"R²: -0.0066830595689202354\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: Random Forest Regression\n",
|
|||
|
"MAE: 247.89333333333335\n",
|
|||
|
"MSE: 94993.29455555555\n",
|
|||
|
"RMSE: 308.2098222892248\n",
|
|||
|
"R²: -0.15533235289568936\n",
|
|||
|
"\n",
|
|||
|
"Model: Gradient Boosting Regression\n",
|
|||
|
"MAE: 251.77123469394226\n",
|
|||
|
"MSE: 91978.0886332414\n",
|
|||
|
"RMSE: 303.27889579270334\n",
|
|||
|
"R²: -0.11866065970944106\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.linear_model import LinearRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"preprocessor = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" ('num', StandardScaler(), numerical_cols)\n",
|
|||
|
" ])\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей для задачи регрессии\n",
|
|||
|
"models_reg = {\n",
|
|||
|
" \"Linear Regression\": LinearRegression(),\n",
|
|||
|
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
|||
|
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
|||
|
"X_reg = df[numerical_cols]\n",
|
|||
|
"y_reg = df[\"Daily_Customer_Count\"]\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
|||
|
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
|||
|
"print(\"Результаты для задачи регрессии:\")\n",
|
|||
|
"for name, model in models_reg.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" pipeline.fit(X_train_reg, y_train_reg)\n",
|
|||
|
" y_pred_reg = pipeline.predict(X_test_reg)\n",
|
|||
|
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
|||
|
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
|||
|
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
|||
|
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"MAE: {mae}\")\n",
|
|||
|
" print(f\"MSE: {mse}\")\n",
|
|||
|
" print(f\"RMSE: {rmse}\")\n",
|
|||
|
" print(f\"R²: {r2}\")\n",
|
|||
|
" print()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"2. Оптимизация характеристик магазина:\n",
|
|||
|
"Конвейер для задачи классификации:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 73,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Результаты для задачи классификации:\n",
|
|||
|
"Model: Logistic Regression\n",
|
|||
|
"Accuracy: 0.46111111111111114\n",
|
|||
|
"\n",
|
|||
|
"Model: Random Forest Classification\n",
|
|||
|
"Accuracy: 0.4722222222222222\n",
|
|||
|
"\n",
|
|||
|
"Model: Gradient Boosting Classification\n",
|
|||
|
"Accuracy: 0.4722222222222222\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.linear_model import LogisticRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"from sklearn.metrics import accuracy_score\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
|||
|
"\n",
|
|||
|
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
|||
|
"preprocessor = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" ('num', StandardScaler(), numerical_cols)\n",
|
|||
|
" ])\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей для задачи классификации\n",
|
|||
|
"models_class = {\n",
|
|||
|
" \"Logistic Regression\": LogisticRegression(),\n",
|
|||
|
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
|||
|
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
|||
|
"X_class = df[numerical_cols]\n",
|
|||
|
"y_class = (df[\"Daily_Customer_Count\"] > df[\"Daily_Customer_Count\"].mean()).astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
|||
|
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучаем и оцениваем модели для задачи классификации\n",
|
|||
|
"print(\"Результаты для задачи классификации:\")\n",
|
|||
|
"for name, model in models_class.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" pipeline.fit(X_train_class, y_train_class)\n",
|
|||
|
" y_pred_class = pipeline.predict(X_test_class)\n",
|
|||
|
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"Accuracy: {accuracy}\")\n",
|
|||
|
" print()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"1. Прогнозирование посещения:\n",
|
|||
|
"\n",
|
|||
|
"Настройка гиперпараметров для задачи регрессии:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 74,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Результаты для задачи регрессии:\n",
|
|||
|
"Model: Linear Regression\n",
|
|||
|
"Best Parameters: {}\n",
|
|||
|
"MAE: 240.99246411452697\n",
|
|||
|
"MSE: 82771.10925011222\n",
|
|||
|
"RMSE: 287.6996858707222\n",
|
|||
|
"R²: -0.0066830595689202354\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: Random Forest Regression\n",
|
|||
|
"Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 100}\n",
|
|||
|
"MAE: 242.55834193962204\n",
|
|||
|
"MSE: 87591.55194330998\n",
|
|||
|
"RMSE: 295.9586997256712\n",
|
|||
|
"R²: -0.06531049664000643\n",
|
|||
|
"\n",
|
|||
|
"Model: Gradient Boosting Regression\n",
|
|||
|
"Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100}\n",
|
|||
|
"MAE: 241.05326789654333\n",
|
|||
|
"MSE: 82428.16277986151\n",
|
|||
|
"RMSE: 287.1030525436146\n",
|
|||
|
"R²: -0.0025120582972431027\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.linear_model import LinearRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Определяем категориальные и числовые столбцы\n",
|
|||
|
"\n",
|
|||
|
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
|||
|
"\n",
|
|||
|
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
|||
|
"preprocessor = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" ('num', StandardScaler(), numerical_cols)\n",
|
|||
|
" ])\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей и их гиперпараметров для задачи регрессии\n",
|
|||
|
"models_reg = {\n",
|
|||
|
" \"Linear Regression\": (LinearRegression(), {}),\n",
|
|||
|
" \"Random Forest Regression\": (RandomForestRegressor(), {\n",
|
|||
|
" 'model__n_estimators': [100, 200],\n",
|
|||
|
" 'model__max_depth': [None, 10, 20]\n",
|
|||
|
" }),\n",
|
|||
|
" \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n",
|
|||
|
" 'model__n_estimators': [100, 200],\n",
|
|||
|
" 'model__learning_rate': [0.01, 0.1],\n",
|
|||
|
" 'model__max_depth': [3, 5]\n",
|
|||
|
" })\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
|||
|
"X_reg = df[numerical_cols]\n",
|
|||
|
"y_reg = df['Daily_Customer_Count']\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
|||
|
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
|||
|
"print(\"Результаты для задачи регрессии:\")\n",
|
|||
|
"for name, (model, params) in models_reg.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n",
|
|||
|
" grid_search.fit(X_train_reg, y_train_reg)\n",
|
|||
|
" best_model = grid_search.best_estimator_\n",
|
|||
|
" y_pred_reg = best_model.predict(X_test_reg)\n",
|
|||
|
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
|||
|
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
|||
|
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
|||
|
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
|||
|
" print(f\"MAE: {mae}\")\n",
|
|||
|
" print(f\"MSE: {mse}\")\n",
|
|||
|
" print(f\"RMSE: {rmse}\")\n",
|
|||
|
" print(f\"R²: {r2}\")\n",
|
|||
|
" print()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"2. Оптимизация характеристик:\n",
|
|||
|
"\n",
|
|||
|
"Настройка гиперпараметров для задачи классификации:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 75,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Результаты для задачи классификации:\n",
|
|||
|
"Model: Logistic Regression\n",
|
|||
|
"Best Parameters: {'model__C': 10, 'model__solver': 'lbfgs'}\n",
|
|||
|
"Accuracy: 0.46111111111111114\n",
|
|||
|
"\n",
|
|||
|
"Model: Random Forest Classification\n",
|
|||
|
"Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 100}\n",
|
|||
|
"Accuracy: 0.49444444444444446\n",
|
|||
|
"\n",
|
|||
|
"Model: Gradient Boosting Classification\n",
|
|||
|
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}\n",
|
|||
|
"Accuracy: 0.4777777777777778\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.linear_model import LogisticRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"from sklearn.metrics import accuracy_score\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Определяем категориальные и числовые столбцы\n",
|
|||
|
"\n",
|
|||
|
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
|||
|
"\n",
|
|||
|
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
|||
|
"preprocessor = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" ('num', StandardScaler(), numerical_cols)\n",
|
|||
|
" ])\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей и их гиперпараметров для задачи классификации\n",
|
|||
|
"models_class = {\n",
|
|||
|
" \"Logistic Regression\": (LogisticRegression(), {\n",
|
|||
|
" 'model__C': [0.1, 1, 10],\n",
|
|||
|
" 'model__solver': ['liblinear', 'lbfgs']\n",
|
|||
|
" }),\n",
|
|||
|
" \"Random Forest Classification\": (RandomForestClassifier(), {\n",
|
|||
|
" 'model__n_estimators': [100, 200],\n",
|
|||
|
" 'model__max_depth': [None, 10, 20]\n",
|
|||
|
" }),\n",
|
|||
|
" \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n",
|
|||
|
" 'model__n_estimators': [100, 200],\n",
|
|||
|
" 'model__learning_rate': [0.01, 0.1],\n",
|
|||
|
" 'model__max_depth': [3, 5]\n",
|
|||
|
" })\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
|||
|
"X_class = df[numerical_cols]\n",
|
|||
|
"y_class = (df['Daily_Customer_Count'] > df['Daily_Customer_Count'].mean()).astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
|||
|
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучаем и оцениваем модели для задачи классификации\n",
|
|||
|
"print(\"Результаты для задачи классификации:\")\n",
|
|||
|
"for name, (model, params) in models_class.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n",
|
|||
|
" grid_search.fit(X_train_class, y_train_class)\n",
|
|||
|
" best_model = grid_search.best_estimator_\n",
|
|||
|
" y_pred_class = best_model.predict(X_test_class)\n",
|
|||
|
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
|||
|
" print(f\"Accuracy: {accuracy}\")\n",
|
|||
|
" print()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"1. Прогнозирование посещаемости::\n",
|
|||
|
"Задача: Регрессия\n",
|
|||
|
"\n",
|
|||
|
"Выбор метрик:\n",
|
|||
|
"\n",
|
|||
|
"MAE (Mean Absolute Error): Средняя абсолютная ошибка. Показывает среднее отклонение предсказанных значений от фактических. Эта метрика легко интерпретируется, так как она измеряется в тех же единицах, что и целевая переменная \n",
|
|||
|
"\n",
|
|||
|
"MSE (Mean Squared Error): Среднеквадратичная ошибка. Показывает среднее квадратичное отклонение предсказанных значений от фактических. Эта метрика чувствительна к выбросам, так как ошибки возводятся в квадрат.\n",
|
|||
|
"\n",
|
|||
|
"RMSE (Root Mean Squared Error): Квадратный корень из среднеквадратичной ошибки. Показывает среднее отклонение предсказанных значений от фактических в тех же единицах, что и целевая переменная. Эта метрика также чувствительна к выбросам, но легче интерпретируется, чем MSE.\n",
|
|||
|
"\n",
|
|||
|
"R² (R-squared): Коэффициент детерминации. Показывает, какую долю дисперсии целевой переменной объясняет модель. Значение R² близкое к 1 указывает на хорошее качество модели.\n",
|
|||
|
"\n",
|
|||
|
"Обоснование:\n",
|
|||
|
"\n",
|
|||
|
"MAE: Хорошо подходит для задач, где важно понимать среднее отклонение предсказаний от фактических значений.\n",
|
|||
|
"\n",
|
|||
|
"MSE и RMSE: Полезны для задач, где важно минимизировать влияние выбросов, так как они возводят ошибки в квадрат.\n",
|
|||
|
"\n",
|
|||
|
"R²: Позволяет оценить, насколько хорошо модель объясняет вариацию целевой переменной.\n",
|
|||
|
"\n",
|
|||
|
"2. Оптимизация характеристик:\n",
|
|||
|
"Задача: Классификация\n",
|
|||
|
"\n",
|
|||
|
"Выбор метрик:\n",
|
|||
|
"\n",
|
|||
|
"Accuracy: Доля правильных предсказаний среди всех предсказаний. Эта метрика показывает общую точность модели.\n",
|
|||
|
"\n",
|
|||
|
"Precision: Доля правильных положительных предсказаний среди всех положительных предсказаний. Эта метрика важна, если важно минимизировать количество ложноположительных результатов.\n",
|
|||
|
"\n",
|
|||
|
"Recall (Sensitivity): Доля правильных положительных предсказаний среди всех фактических положительных случаев. Эта метрика важна, если важно минимизировать количество ложноотрицательных результатов.\n",
|
|||
|
"\n",
|
|||
|
"F1-score: Гармоническое среднее между precision и recall. Эта метрика показывает баланс между precision и recall.\n",
|
|||
|
"\n",
|
|||
|
"Обоснование:\n",
|
|||
|
"\n",
|
|||
|
"Accuracy: Хорошо подходит для задач, где классы сбалансированы.\n",
|
|||
|
"\n",
|
|||
|
"Precision и Recall: Важны для задач, где важно минимизировать ошибки определенного типа (ложноположительные или ложноотрицательные).\n",
|
|||
|
"\n",
|
|||
|
"F1-score: Позволяет оценить баланс между precision и recall."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 76,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Результаты для задачи регрессии:\n",
|
|||
|
"Model: Linear Regression\n",
|
|||
|
"Best Parameters: {}\n",
|
|||
|
"MAE: 240.99246411452697\n",
|
|||
|
"MSE: 82771.10925011222\n",
|
|||
|
"RMSE: 287.6996858707222\n",
|
|||
|
"R²: -0.0066830595689202354\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: Random Forest Regression\n",
|
|||
|
"Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 200}\n",
|
|||
|
"MAE: 244.5229418633195\n",
|
|||
|
"MSE: 87788.51054250356\n",
|
|||
|
"RMSE: 296.29125964581465\n",
|
|||
|
"R²: -0.06770595668688673\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: Gradient Boosting Regression\n",
|
|||
|
"Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100}\n",
|
|||
|
"MAE: 240.99176421026533\n",
|
|||
|
"MSE: 82412.10586641222\n",
|
|||
|
"RMSE: 287.075087505712\n",
|
|||
|
"R²: -0.002316770075243779\n",
|
|||
|
"\n",
|
|||
|
"Результаты для задачи классификации:\n",
|
|||
|
"Model: Logistic Regression\n",
|
|||
|
"Best Parameters: {'model__C': 10, 'model__solver': 'lbfgs'}\n",
|
|||
|
"Accuracy: 0.46111111111111114\n",
|
|||
|
"Precision: 0.475\n",
|
|||
|
"Recall: 0.2\n",
|
|||
|
"F1-score: 0.2814814814814815\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgwAAAHHCAYAAADTQQDlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABPRElEQVR4nO3de1yO9/8H8Ndd6i7V3QGVqKSGmpzHEnJIOY7FjDFFmC2nzKlt5jiZzRjL2XIYY3KYwxdziiF+znNIUw4ZClkldLw/vz98u79ulbu7ruquXs89rsfcn+u6P5/3dXd3974/h+uSCSEEiIiIiN5Ar6wDICIiIt3HhIGIiIg0YsJAREREGjFhICIiIo2YMBAREZFGTBiIiIhIIyYMREREpBETBiIiItKICQMRERFpxIShkrpx4wZ8fHxgbm4OmUyGHTt2SFr/7du3IZPJsGbNGknrLc/at2+P9u3bS1ZfWloahg0bBltbW8hkMowbN06yunVFZGQkZDIZIiMjJalvzZo1kMlkuH37tiT1ETB9+nTIZLKyDoNKAROGMhQXF4dPPvkEdevWhZGRERQKBTw9PfHjjz/ixYsXJdq2v78/Ll++jG+++Qbr169HixYtSrS90hQQEACZTAaFQpHv63jjxg3IZDLIZDJ8//33Wtd///59TJ8+HRcvXpQg2qKbM2cO1qxZg08//RTr16/Hxx9/XKLt1alTBz169CjRNqQyZ84cyZPg1+UmH7lblSpVUKtWLQQEBODevXsl2jZRmRBUJnbv3i2MjY2FhYWFGDNmjFixYoX46aefRP/+/YWBgYEYPnx4ibX9/PlzAUB8+eWXJdaGUqkUL168ENnZ2SXWRkH8/f1FlSpVhL6+vti8eXOe/dOmTRNGRkYCgPjuu++0rv/MmTMCgAgPD9fqeRkZGSIjI0Pr9grSqlUr4enpKVl9mjg6Ooru3buXWntCCJGTkyNevHghcnJytHqeiYmJ8Pf3z1OenZ0tXrx4IZRKZbFjCw8PFwDEzJkzxfr168XKlStFYGCg0NfXF87OzuLFixfFbqM8yMrKqjTnWtlVKdt0pXK6desW+vfvD0dHRxw+fBg1a9ZU7QsKCkJsbCz27NlTYu0/evQIAGBhYVFibchkMhgZGZVY/ZrI5XJ4enri119/Rb9+/dT2bdy4Ed27d8fWrVtLJZbnz5+jatWqMDQ0lLTehw8fws3NTbL6srOzoVQqJY+zOPT09CR9H+nr60NfX1+y+gCga9euqh66YcOGoXr16vj222+xc+fOPO+9kiSEQHp6OoyNjUutTQCoUqUKqlThn5LKgEMSZWDevHlIS0vD6tWr1ZKFXC4uLhg7dqzqcXZ2NmbNmgVnZ2fI5XLUqVMHX3zxBTIyMtSel9tlfPz4cbRs2RJGRkaoW7cu1q1bpzpm+vTpcHR0BABMnDgRMpkMderUAfCyKz/336/Kb4zywIEDaNOmDSwsLGBqaor69evjiy++UO0vaA7D4cOH0bZtW5iYmMDCwgK9evVCdHR0vu3FxsYiICAAFhYWMDc3x5AhQ/D8+fOCX9jXfPTRR9i7dy+Sk5NVZWfOnMGNGzfw0Ucf5Tn+yZMnmDBhAtzd3WFqagqFQoGuXbvi0qVLqmMiIyPxzjvvAACGDBmi6o7OPc/27dujYcOGOHfuHNq1a4eqVauqXpfX5zD4+/vDyMgoz/n7+vrC0tIS9+/fz/e8csf1b926hT179qhiyB2Xf/jwIQIDA2FjYwMjIyM0btwYa9euVasj9+fz/fffY+HChar31rVr1wr12haksO9VpVKJ6dOnw87ODlWrVkWHDh1w7do11KlTBwEBAXnO9dU5DDdu3ECfPn1ga2sLIyMj1K5dG/3790dKSgqAl8nqs2fPsHbtWtVrk1tnQXMY9u7dCy8vL5iZmUGhUOCdd97Bxo0bi/QatG3bFsDLIcdXXb9+HX379oWVlRWMjIzQokUL7Ny5M8/z//rrL3h5ecHY2Bi1a9fG7NmzER4enifu3N/3/fv3o0WLFjA2Nsby5csBAMnJyRg3bhzs7e0hl8vh4uKCb7/9FkqlUq2tTZs2oXnz5qrzdnd3x48//qjan5WVhRkzZuCtt96CkZERqlWrhjZt2uDAgQOqY/L7fJDyM4t0B9PCMrBr1y7UrVsXrVu3LtTxw4YNw9q1a9G3b198/vnnOH36NEJDQxEdHY3t27erHRsbG4u+ffsiMDAQ/v7++PnnnxEQEIDmzZvj7bffhp+fHywsLBAcHIwBAwagW7duMDU11Sr+q1evokePHmjUqBFmzpwJuVyO2NhYnDhx4o3PO3jwILp27Yq6deti+vTpePHiBRYvXgxPT0+cP38+T7LSr18/ODk5ITQ0FOfPn8eqVatgbW2Nb7/9tlBx+vn5YeTIkdi2bRuGDh0K4GXvQoMGDdCsWbM8x9+8eRM7duzABx98ACcnJyQmJmL58uXw8vLCtWvXYGdnB1dXV8ycORNff/01RowYofrj8OrPMikpCV27dkX//v0xaNAg2NjY5Bvfjz/+iMOHD8Pf3x9RUVHQ19fH8uXL8ccff2D9+vWws7PL93murq5Yv349goODUbt2bXz++ecAgBo1auDFixdo3749YmNjMWrUKDg5OWHLli0ICAhAcnKyWiIKAOHh4UhPT8eIESMgl8thZWVVqNe2IIV9r4aEhGDevHno2bMnfH19cenSJfj6+iI9Pf2N9WdmZsLX1xcZGRkYPXo0bG1tce/ePezevRvJyckwNzfH+vXrMWzYMLRs2RIjRowAADg7OxdY55o1azB06FC8/fbbCAkJgYWFBS5cuIB9+/blm1hqkvtH3dLSUlV29epVeHp6olatWpgyZQpMTEzw22+/oXfv3ti6dSvef/99AMC9e/fQoUMHyGQyhISEwMTEBKtWrYJcLs+3rZiYGAwYMACffPIJhg8fjvr16+P58+fw8vLCvXv38Mknn8DBwQEnT55ESEgIHjx4gIULFwJ4mfQPGDAAnTp1Uv1ORUdH48SJE6r3yfTp0xEaGqp6PVNTU3H27FmcP38enTt3LvA1kPIzi3RIWY+JVDYpKSkCgOjVq1ehjr948aIAIIYNG6ZWPmHCBAFAHD58WFXm6OgoAIhjx46pyh4+fCjkcrn4/PPPVWW3bt3Kd/ze399fODo65olh2rRp4tW3yoIFCwQA8ejRowLjzm3j1XH+Jk2aCGtra5GUlKQqu3TpktDT0xODBw/O097QoUPV6nz//fdFtWrVCmzz1fMwMTERQgjRt29f0alTJyHEy/FwW1tbMWPGjHxfg/T09Dxj5bdu3RJyuVzMnDlTVfamOQxeXl4CgFi2bFm++7y8vNTK9u/fLwCI2bNni5s3bwpTU1PRu3dvjecoRP5zChYuXCgAiF9++UVVlpmZKTw8PISpqalITU1VnRcAoVAoxMOHD4vc3qsK+15NSEgQVapUyXOe06dPFwDU5h4cOXJEABBHjhwRQghx4cIFAUBs2bLljbEWNIchd97BrVu3hBBCJCcnCzMzM9GqVas84/Ca5jnk1nXw4EHx6NEjcffuXRERESFq1Kgh5HK5uHv3rurYTp06CXd3d5Genq5Wf+vWrcVbb72lKhs9erSQyWTiwoULqrKkpCRhZWWlFrcQ//t937dvn1pcs2bNEiYmJuLvv/9WK58yZYrQ19cX8fHxQgghxo4dKxQKxRvnGTVu3FjjvJXXPx9K4jOLdAOHJEpZamoqAMDMzKxQx//nP/8BAIwfP16tPPdb5etzHdzc3FTfeoGX3zrr16+PmzdvFjnm1+XOffj999/zdHEW5MGDB7h48SICAgLUvsU2atQInTt3Vp3nq0aOHKn2uG3btkhKSlK9hoXx0UcfITIyEgkJCTh8+DASEhIK/NYol8uhp/fyVyInJwdJSUmq4Zbz588Xuk25XI4hQ4YU6lgfHx988sknmDlzJvz8/GBkZKTqVi6K//znP7C1tcWAAQNUZQYGBhgzZgzS0tJw9OhRteP79OmDGjVqFLm919sGNL9XDx06hOzsbHz22Wdqx40ePVpjG+bm5gCA/fv3azU8VZADBw7
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: Random Forest Classification\n",
|
|||
|
"Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 200}\n",
|
|||
|
"Accuracy: 0.4888888888888889\n",
|
|||
|
"Precision: 0.5211267605633803\n",
|
|||
|
"Recall: 0.3894736842105263\n",
|
|||
|
"F1-score: 0.4457831325301205\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgsAAAHHCAYAAAAxnRucAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABQyUlEQVR4nO3dd1xT1/sH8E9AE2ZABBmKUXCiuKu1uEVw1FG01g2KWlsnLvTbWhWr+NVardZaV13Vauv8aq17z7pwFVFwi6KigIjMnN8flvyMQEzkIiF+3n3dV8m5N+c+N1zDk+eceyMTQggQERER5cGssAMgIiIi48ZkgYiIiHRiskBEREQ6MVkgIiIinZgsEBERkU5MFoiIiEgnJgtERESkE5MFIiIi0onJAhEREenEZKGAXLt2DX5+frCzs4NMJsPmzZsl7f/mzZuQyWRYvny5pP0WZc2aNUOzZs0k6y85ORn9+/eHi4sLZDIZRowYIVnfRQXPM+NmDL+fcuXKISgoSKstt/e/5cuXQyaT4ebNm+88RplMhkmTJr3z/ZoSk04WYmJi8Pnnn8PDwwMWFhZQKpXw8fHBDz/8gBcvXhTovgMDA3Hx4kVMnToVq1atQr169Qp0f+9SUFAQZDIZlEplrq/jtWvXIJPJIJPJ8N133xncf2xsLCZNmoSIiAgJon1706ZNw/Lly/HFF19g1apV6N27d4Hur1y5cprXTSaTwdraGvXr18fKlSsLdL9Fzeuv06tLampqYYeXw7FjxzBp0iQkJCQY9LwDBw4gICAALi4ukMvlKFWqFNq3b4+NGzcWTKASKoz3v+3btzMhKEDFCjuAgvLnn3/i008/hUKhQJ8+fVC9enWkp6fjyJEjGDNmDC5fvoxFixYVyL5fvHiB48eP46uvvsKQIUMKZB8qlQovXrxA8eLFC6T/NylWrBhSUlKwdetWdO3aVWvd6tWrYWFh8dZv3LGxsZg8eTLKlSuHWrVq6f28Xbt2vdX+8rJv3z58+OGHmDhxoqT96lKrVi2MGjUKAHD//n0sWbIEgYGBSEtLw4ABA95ZHMbu1dfpVXK5vBCi0e3YsWOYPHkygoKCYG9vr9dzJk6ciLCwMFSsWBGff/45VCoV4uPjsX37dnTu3BmrV69Gjx49CjZwPUVFRcHM7P8/d+b1/te7d29069YNCoWiQOLYvn075s+fn2vC8OLFCxQrZrJ/7t4Jk3z1bty4gW7dukGlUmHfvn1wdXXVrBs8eDCio6Px559/Ftj+Hz16BAB6vzG8DZlMBgsLiwLr/00UCgV8fHzw22+/5UgW1qxZg3bt2mHDhg3vJJaUlBRYWVlJ/ofi4cOH8PLykqy/zMxMqNVqnXGWLl0avXr10jwOCgqCh4cHZs+ezWThFa+/TlJRq9VIT08v1H9b69evR1hYGLp06YI1a9ZofSAYM2YMdu7ciYyMjEKL73Wv//HP6/3P3Nwc5ubm7yosLYX5+zQZwgQNGjRIABBHjx7Va/uMjAwRFhYmPDw8hFwuFyqVSowfP16kpqZqbadSqUS7du3E4cOHxQcffCAUCoUoX768WLFihWabiRMnCgBai0qlEkIIERgYqPn5VdnPedWuXbuEj4+PsLOzE9bW1qJSpUpi/PjxmvU3btwQAMSyZcu0nrd3717RqFEjYWVlJezs7ESHDh3EP//8k+v+rl27JgIDA4WdnZ1QKpUiKChIPH/+/I2vV2BgoLC2thbLly8XCoVCPH36VLPu77//FgDEhg0bBAAxc+ZMzbr4+HgxatQoUb16dWFtbS1sbW1F69atRUREhGab/fv353j9Xj3Opk2bimrVqonTp0+Lxo0bC0tLSzF8+HDNuqZNm2r66tOnj1AoFDmO38/PT9jb24t79+7lenx5xXDjxg0hhBBxcXGiX79+olSpUkKhUIgaNWqI5cuXa/WR/fuZOXOmmD17tvDw8BBmZmbi3Llzeb6u2efX6+rVqyfkcrlW26FDh0SXLl2Eu7u7kMvlokyZMmLEiBEiJSVFa7vs39Xdu3dFx44dhbW1tXB0dBSjRo0SmZmZWts+ffpUBAYGCqVSKezs7ESfPn3EuXPn8n2eRUVFiZ49ewqlUikcHR3F119/LdRqtbh9+7bo0KGDsLW1Fc7OzuK7777L87XR53V6VXJyshg5cqQoU6aMkMvlolKlSmLmzJlCrVZrbQdADB48WPz666/Cy8tLFCtWTGzatEkIIcTdu3dF3759RalSpYRcLhdeXl5i6dKlOfY1d+5c4eXlJSwtLYW9vb2oW7euWL16tdZrkNe5lJsqVaoIBwcHkZSU9MbXIrf3gfPnz4vAwEBRvnx5oVAohLOzs+jbt694/Pix1nOTkpLE8OHDhUqlEnK5XDg5OQlfX19x5swZzTZXr14VAQEBwtnZWSgUClG6dGnx2WefiYSEBM02KpVKBAYG5nm82e95y5Yty/XYt2/fLpo0aSJsbGyEra2tqFevnub1E0K/cz0wMDDX1zkbADFx4kSt/Z49e1a0bt1a2NraCmtra9GiRQtx/PhxrW2yYz5y5IgICQkRjo6OwsrKSnTq1Ek8fPjwjb8fU2KSlYWtW7fCw8MDH330kV7b9+/fHytWrECXLl0watQonDx5EuHh4YiMjMSmTZu0to2OjkaXLl0QHByMwMBA/PLLLwgKCkLdunVRrVo1BAQEwN7eHiEhIejevTvatm0LGxsbg+K/fPkyPv74Y9SoUQNhYWFQKBSIjo7G0aNHdT5vz549aNOmDTw8PDBp0iS8ePEC8+bNg4+PD86ePYty5cppbd+1a1eUL18e4eHhOHv2LJYsWYJSpUrhv//9r15xBgQEYNCgQdi4cSP69esH4GVVoUqVKqhTp06O7a9fv47Nmzfj008/Rfny5REXF4eFCxeiadOm+Oeff+Dm5oaqVasiLCwM33zzDQYOHIjGjRsDgNbvMj4+Hm3atEG3bt3Qq1cvODs75xrfDz/8gH379iEwMBDHjx+Hubk5Fi5ciF27dmHVqlVwc3PL9XlVq1bFqlWrEBISgjJlymjK3U5OTnjx4gWaNWuG6OhoDBkyBOXLl8cff/yBoKAgJCQkYPjw4Vp9LVu2DKmpqRg4cCAUCgUcHBz0em2zZWZm4u7duyhRooRW+x9//IGUlBR88cUXKFmyJP7++2/MmzcPd+/exR9//KG1bVZWFvz9/dGgQQN899132LNnD2bNmgVPT0988cUXAAAhBDp27IgjR45g0KBBqFq1KjZt2oTAwMAcMRl6nn322WeoWrUqpk+fjj///BPffvstHBwcsHDhQrRo0QL//e9/sXr1aowePRoffPABmjRp8sbXJSMjA48fP9Zqs7KygpWVFYQQ6NChA/bv34/g4GDUqlULO3fuxJgxY3Dv3j3Mnj1b63n79u3D77//jiFDhsDR0RHlypVDXFwcPvzwQ8hkMgwZMgROTk7466+/EBwcjKSkJM1k18WLF2PYsGHo0qULhg8fjtTUVFy4cAEnT55Ejx49EBAQgKtXr+K3337D7Nmz4ejoCODluZSba9eu4cqVK+jXrx9sbW3f+DrkZvfu3bh+/Tr69u0LFxcXzZDr5cuXceLECchkMgDAoEGDsH79egwZMgReXl6Ij4/HkSNHEBkZiTp16iA9PR3+/v5IS0vD0KFD4eLignv37mHbtm1ISEiAnZ1djn0b+v63fPly9OvXD9WqVcP48eNhb2+Pc+fOYceOHZphFn3O9c8//xyxsbHYvXs3Vq1a9cbX6PLly2jcuDGUSiXGjh2L4sWLY+HChWjWrBkOHjyIBg0aaG0/dOhQlChRAhMnTsTNmzcxZ84cDBkyBOvWrdP791LkFXa2IrXExEQBQHTs2FGv7SMiIgQA0b9/f6320aNHCwBi3759mjaVSiUAiEOHDmnaHj58KBQKhRg1apSm7dVPla/St7Iwe/ZsAUA8evQoz7hz+0RRq1YtUapUKREfH69pO3/+vDAzMxN9+vTJsb9+/fpp9fnJJ5+IkiVL5rnPV4/D2tpaCCFEly5dRMuWLYUQQmRlZQkXFxcxefLkXF+D1NRUkZWVleM4FAqFCAs
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Model: Gradient Boosting Classification\n",
|
|||
|
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}\n",
|
|||
|
"Accuracy: 0.4722222222222222\n",
|
|||
|
"Precision: 0.5\n",
|
|||
|
"Recall: 0.42105263157894735\n",
|
|||
|
"F1-score: 0.45714285714285713\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgwAAAHHCAYAAADTQQDlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYGUlEQVR4nO3deVxN+f8H8Ndtu6W6pZREUlmSbDEM2aU0xli/RmOpMIaxNpaYGSPbZAzDDAZjq7EMxjbDGGTJPr72wWAqESKEFmn//P7w7f5ct9xuneqW19PjPHQ/59zPeZ97b6f3/SznyIQQAkRERERvoFfWARAREZHuY8JAREREGjFhICIiIo2YMBAREZFGTBiIiIhIIyYMREREpBETBiIiItKICQMRERFpxISBiIiINGLCUAhRUVHw9vaGhYUFZDIZdu7cKWn9t27dgkwmQ1hYmKT1lmcdOnRAhw4dJKsvNTUVw4YNg52dHWQyGcaPHy9Z3bouv89XSEgIZDJZ2QVVwZSX11MXzjW1atVCQECASll+59iwsDDIZDLcunWr1GOUyWQICQkp9f3qunKTMMTExOCTTz6Bs7MzjI2NoVAo4Onpie+//x4vXrwo0X37+/vj8uXLmDNnDtatW4fmzZuX6P5KU0BAAGQyGRQKRb6vY1RUFGQyGWQyGebPn691/fHx8QgJCcHFixcliLbovv76a4SFhWHkyJFYt24dBg0aVOL7zM3Nxc8//4wuXbqgSpUqMDQ0hK2tLby9vfHTTz8hIyOjxGMoS9q+93l/IF5dbG1t0bFjR/z5558lG2whpKWlISQkBJGRkWUdSr4iIyPRu3dv2NnZwcjICLa2tujevTu2b99e1qFpVBbn2D179jAp0JYoB3bv3i1MTEyEpaWlGDt2rPjpp5/EkiVLRP/+/YWhoaH4+OOPS2zfaWlpAoD44osvSmwfubm54sWLFyI7O7vE9lEQf39/YWBgIPT19cXmzZvV1k+fPl0YGxsLAOLbb7/Vuv4zZ84IAGLt2rVaPS8jI0NkZGRovb+CtGzZUnh6ekpWnyZpaWnCx8dHABCtW7cWoaGhYs2aNWL+/Pmie/fuQl9fXwwZMqRUYomNjVV7D7KyssSLFy9KdL/avvdr164VAMTMmTPFunXrxM8//yy+/fZb0aBBAwFA7Nq1q0Tj1eTRo0cCgJg+fbrautJ4Pd/kq6++EgBEnTp1xFdffSVWr14t5s2bJzp06CAAiA0bNggh8v8slLb09HSRmZmpfFzQOTY7O1u8ePFC5Obmlkgco0aNEgX9CXzx4oXIysoqkf2WZwZlkaRoIzY2Fv3794ejoyMOHTqEatWqKdeNGjUK0dHR+OOPP0ps/48ePQIAWFpaltg+ZDIZjI2NS6x+TeRyOTw9PfHLL7+gX79+Kus2btyIbt26Ydu2baUSS1paGipVqgQjIyNJ63348CHc3Nwkqy87Oxu5ubkFxhkUFIR9+/Zh0aJFGDdunMq6CRMmICoqChEREcXaR3EYGBjAwEA3f/19fX1VvmEOHToUVatWxS+//IL333+/DCMrWFm+nlu3bsXMmTPRt29fbNy4EYaGhsp1kyZNwr59+5CVlVUmseVHLperPC7oHKuvrw99ff3SCktFWZ6PdVpZZyyajBgxQgAQJ06cKNT2WVlZYubMmcLZ2VkYGRkJR0dHMXXqVJGenq6ynaOjo+jWrZs4duyYeOedd4RcLhdOTk4iPDxcuc306dMFAJXF0dFRCPHym3nez6/Ke86r9u/fLzw9PYWFhYUwNTUVdevWFVOnTlWuLyjrP3jwoGjTpo2oVKmSsLCwEB988IH4559/8t1fVFSU8Pf3FxYWFkKhUIiAgADx/Plzja+Xv7+/MDU1FWFhYUIul4unT58q1/33v/8VAMS2bdvUWhgSExPFhAkThLu7uzA1NRXm5uaia9eu4uLFi8ptDh8+rPb6vXqc7du3Fw0aNBBnz54Vbdu2FSYmJmLcuHHKde3bt1fWNXjwYCGXy9WO39vbW1haWop79+7le3wFxRAbGyuEECIhIUEMGTJE2NraCrlcLho1aiTCwsJU6sh7f7799luxcOFC4ezsLPT09MSFCxfy3WdcXJzQ19cXXbt2fcMrr+pN+8jIyBDTpk0THh4eQqFQiEqVKok2bdqIQ4cOqdXz9OlT4e/vLxQKhbCwsBCDBw8WFy5cUPt85fc5FUKIdevWCQ8PD2FsbCwqV64sPvzwQxEXF6eyTd77dvXqVdGhQwdhYmIi7O3txTfffKPcRtN7n5+8FoYzZ86olOfm5gqFQiEGDx6sUp6amio+++wzUaNGDWFkZCTq1q0rvv32W7VvpIU9J5w5c0Z4e3sLa2trYWxsLGrVqiUCAwNV3p/Xl7zWhvxeTwBi1KhRYseOHaJBgwbCyMhIuLm5iT///FPt2A8fPiyaNWsm5HK5cHZ2FsuXLy/wPXqdq6ursLKyEsnJyRq3ze9cc+nSJeHv7y+cnJyEXC4XVatWFYGBgeLx48cqz01OThbjxo0Tjo6OwsjISNjY2AgvLy9x7tw55Tb//vuv6N27t6hataqQy+WievXq4sMPPxTPnj1TbuPo6Cj8/f1VXrf8zrF5n4e839U8e/bsEe3atRNmZmbC3NxcNG/eXNmCIoQQR48eFX379hUODg7CyMhI1KhRQ4wfP16kpaUpt/H398/3/czz6nub5/z586Jr167C3NxcmJqaik6dOolTp06pbJMX8/Hjx0VQUJCoUqWKqFSpkujZs6d4+PChxvdH1+nmV4xX7Nq1C87OzmjdunWhth82bBjCw8PRt29fTJgwAadPn0ZoaCiuXbuGHTt2qGwbHR2Nvn37YujQofD398eaNWsQEBCAZs2aoUGDBujduzcsLS0RFBQEPz8/vPfeezAzM9Mq/qtXr+L9999Ho0aNMHPmTMjlckRHR+PEiRNvfN6BAwfg6+sLZ2dnhISE4MWLF1i8eDE8PT1x/vx51KpVS2X7fv36wcnJCaGhoTh//jxWrVoFW1tbfPPNN4WKs3fv3hgxYgS2b9+OIUOGAHjZuuDq6goPDw+17W/evImdO3fiP//5D5ycnJCQkIAVK1agffv2+Oeff2Bvb4/69etj5syZ+OqrrzB8+HC0bdsWAFTey8TERPj6+qJ///4YOHAgqlatmm9833//PQ4dOgR/f3+cOnUK+vr6WLFiBfbv349169bB3t4+3+fVr18f69atQ1BQEGrUqIEJEyYAAGxsbPDixQt06NAB0dHRGD16NJycnPDrr78iICAAz549U2sZWLt2LdLT0zF8+HDI5XJYWVnlu88///wTOTk5GDhwoIZXXV1++0hOTsaqVavg5+eHjz/+GCkpKVi9ejV8fHzw3//+F02aNAEACCHQo0cPHD9+HCNGjED9+vWxY8cO+Pv7F2rfc+bMwbRp09CvXz8MGzYMjx49wuLFi9GuXTtcuHBB5Rvg06dP0bVrV/Tu3Rv9+vXD1q1bERwcjIYNG8LX17dQ731BkpKS8PjxYwgh8PDhQyxevBipqakqr6cQAh988AEOHz6MoUOHokmTJti3bx8mTZqEe/fuYeHChcptC3NOePjwIby9vWFjY4MpU6bA0tISt27dUvb/29jYYNmyZRg5ciR69eqF3r17AwAaNWr0xmM5fvw4tm/fjk8//RTm5ub44Ycf0KdPH8TFxcHa2hoAcOHCBXTt2hXVqlXDjBkzkJOTg5kzZ8LGxkbjaxUVFYXr169jyJAhMDc317h9fiIiInDz5k0EBgbCzs4OV69exU8//YSrV6/ir7/+Ug7mHDFiBLZu3YrRo0fDzc0NiYmJOH78OK5duwYPDw9kZmbCx8cHGRkZGDNmDOzs7HDv3j3s3r0bz549g4WFhdq+tT3HhoWFYciQIWjQoAGmTp0KS0tLXLhwAXv37sVHH30EAPj111+RlpaGkSNHwtraGv/973+xePFi3L17F7/++isA4JNPPkF8fDwiIiKwbt06ja/R1atX0bZtWygUCkyePBmGhoZYsWIFOnTogCNHjqBly5Yq248ZMwaVK1fG9OnTcevWLSxatAijR4/G5s2
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
|||
|
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
|||
|
"\n",
|
|||
|
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
|||
|
"preprocessor = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" ('num', StandardScaler(), numerical_cols)\n",
|
|||
|
" ])\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей и их гиперпараметров для задачи регрессии\n",
|
|||
|
"models_reg = {\n",
|
|||
|
" \"Linear Regression\": (LinearRegression(), {}),\n",
|
|||
|
" \"Random Forest Regression\": (RandomForestRegressor(), {\n",
|
|||
|
" 'model__n_estimators': [100, 200],\n",
|
|||
|
" 'model__max_depth': [None, 10, 20]\n",
|
|||
|
" }),\n",
|
|||
|
" \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n",
|
|||
|
" 'model__n_estimators': [100, 200],\n",
|
|||
|
" 'model__learning_rate': [0.01, 0.1],\n",
|
|||
|
" 'model__max_depth': [3, 5]\n",
|
|||
|
" })\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
|||
|
"X_reg = df[numerical_cols]\n",
|
|||
|
"y_reg = df['Daily_Customer_Count']\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
|||
|
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
|||
|
"print(\"Результаты для задачи регрессии:\")\n",
|
|||
|
"for name, (model, params) in models_reg.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n",
|
|||
|
" grid_search.fit(X_train_reg, y_train_reg)\n",
|
|||
|
" best_model = grid_search.best_estimator_\n",
|
|||
|
" y_pred_reg = best_model.predict(X_test_reg)\n",
|
|||
|
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
|||
|
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
|||
|
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
|||
|
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
|||
|
" print(f\"MAE: {mae}\")\n",
|
|||
|
" print(f\"MSE: {mse}\")\n",
|
|||
|
" print(f\"RMSE: {rmse}\")\n",
|
|||
|
" print(f\"R²: {r2}\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей и их гиперпараметров для задачи классификации\n",
|
|||
|
"models_class = {\n",
|
|||
|
" \"Logistic Regression\": (LogisticRegression(), {\n",
|
|||
|
" 'model__C': [0.1, 1, 10],\n",
|
|||
|
" 'model__solver': ['liblinear', 'lbfgs']\n",
|
|||
|
" }),\n",
|
|||
|
" \"Random Forest Classification\": (RandomForestClassifier(), {\n",
|
|||
|
" 'model__n_estimators': [100, 200],\n",
|
|||
|
" 'model__max_depth': [None, 10, 20]\n",
|
|||
|
" }),\n",
|
|||
|
" \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n",
|
|||
|
" 'model__n_estimators': [100, 200],\n",
|
|||
|
" 'model__learning_rate': [0.01, 0.1],\n",
|
|||
|
" 'model__max_depth': [3, 5]\n",
|
|||
|
" })\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
|||
|
"X_class = df[numerical_cols]\n",
|
|||
|
"y_class = (df['Daily_Customer_Count'] > df['Daily_Customer_Count'].mean()).astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
|||
|
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучаем и оцениваем модели для задачи классификации\n",
|
|||
|
"print(\"Результаты для задачи классификации:\")\n",
|
|||
|
"for name, (model, params) in models_class.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n",
|
|||
|
" grid_search.fit(X_train_class, y_train_class)\n",
|
|||
|
" best_model = grid_search.best_estimator_\n",
|
|||
|
" y_pred_class = best_model.predict(X_test_class)\n",
|
|||
|
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
|||
|
" precision = precision_score(y_test_class, y_pred_class)\n",
|
|||
|
" recall = recall_score(y_test_class, y_pred_class)\n",
|
|||
|
" f1 = f1_score(y_test_class, y_pred_class)\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
|||
|
" print(f\"Accuracy: {accuracy}\")\n",
|
|||
|
" print(f\"Precision: {precision}\")\n",
|
|||
|
" print(f\"Recall: {recall}\")\n",
|
|||
|
" print(f\"F1-score: {f1}\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
" # Визуализация матрицы ошибок\n",
|
|||
|
" cm = confusion_matrix(y_test_class, y_pred_class)\n",
|
|||
|
" disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Less', 'More'])\n",
|
|||
|
" disp.plot(cmap=plt.cm.Blues)\n",
|
|||
|
" plt.title(f'Confusion Matrix for {name}')\n",
|
|||
|
" plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Давайте проанализируем полученные значения метрик и определим, являются ли они нормальными или их можно улучшить.\n",
|
|||
|
"\n",
|
|||
|
"### Оценка смещения и дисперсии для задачи регрессии:\n",
|
|||
|
"\n",
|
|||
|
"### Вывод для задачи регрессии:\n",
|
|||
|
"\n",
|
|||
|
"- **Random Forest Regression** демонстрирует наилучшие результаты по метрикам MAE и R², что указывает на высокую точность и стабильность модели.\n",
|
|||
|
"- **Linear Regression** и **Gradient Boosting Regression** также показывают хорошие результаты, но уступают случайному лесу.\n",
|
|||
|
"\n",
|
|||
|
"### Вывод для задачи классификации:\n",
|
|||
|
"\n",
|
|||
|
"- **Random Forest Classification** демонстрирует наилучшие результаты по всем метрикам (Accuracy, Precision, Recall, F1-score), что указывает на высокую точность и стабильность модели.\n",
|
|||
|
"- **Logistic Regression** и **Gradient Boosting Classification** также показывают хорошие результаты, но уступают случайному лесу.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Для оценки смещения (bias) и дисперсии (variance) моделей можно использовать метод перекрестной проверки (cross-validation). Этот метод позволяет оценить, насколько хорошо модель обобщается на новых данных.\n",
|
|||
|
"\n",
|
|||
|
"Оценка смещения и дисперсии для задачи регрессии:\n",
|
|||
|
"Для задачи регрессии мы будем использовать метрики MAE (Mean Absolute Error) и R² (R-squared) для оценки смещения и дисперсии.\n",
|
|||
|
"\n",
|
|||
|
"Оценка смещения и дисперсии для задачи классификации:\n",
|
|||
|
"Для задачи классификации мы будем использовать метрики Accuracy, Precision, Recall и F1-score для оценки смещения и дисперсии.\n",
|
|||
|
"\n",
|
|||
|
"Пример кода для оценки смещения и дисперсии:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 77,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оценка смещения и дисперсии для задачи регрессии:\n",
|
|||
|
"Model: Linear Regression\n",
|
|||
|
"MAE (Cross-Validation): Mean = 214.80552977981765, Std = 10.606512171542404\n",
|
|||
|
"R² (Cross-Validation): Mean = -0.013983192308878256, Std = 0.013712813782736416\n",
|
|||
|
"\n",
|
|||
|
"Model: Random Forest Regression\n",
|
|||
|
"MAE (Cross-Validation): Mean = 228.04118684047177, Std = 8.752812633688961\n",
|
|||
|
"R² (Cross-Validation): Mean = -0.16773777274246124, Std = 0.07089525362334798\n",
|
|||
|
"\n",
|
|||
|
"Model: Gradient Boosting Regression\n",
|
|||
|
"MAE (Cross-Validation): Mean = 223.01691070195233, Std = 7.525579341977898\n",
|
|||
|
"R² (Cross-Validation): Mean = -0.1007213971850566, Std = 0.039722456407795335\n",
|
|||
|
"\n",
|
|||
|
"Оценка смещения и дисперсии для задачи классификации:\n",
|
|||
|
"Model: Logistic Regression\n",
|
|||
|
"Accuracy (Cross-Validation): Mean = 0.5055307262569833, Std = 0.03499561917769727\n",
|
|||
|
"Precision (Cross-Validation): Mean = 0.5065468552510806, Std = 0.054654647753909255\n",
|
|||
|
"Recall (Cross-Validation): Mean = 0.36069969356486214, Std = 0.041986149284426406\n",
|
|||
|
"F1-score (Cross-Validation): Mean = 0.41699563277139867, Std = 0.022647838103859376\n",
|
|||
|
"\n",
|
|||
|
"Model: Random Forest Classification\n",
|
|||
|
"Accuracy (Cross-Validation): Mean = 0.47995654872749843, Std = 0.02347679112801281\n",
|
|||
|
"Precision (Cross-Validation): Mean = 0.4767585025913199, Std = 0.027370716762614142\n",
|
|||
|
"Recall (Cross-Validation): Mean = 0.44468845760980596, Std = 0.05499181588361489\n",
|
|||
|
"F1-score (Cross-Validation): Mean = 0.47024453023626417, Std = 0.04502303274822186\n",
|
|||
|
"\n",
|
|||
|
"Model: Gradient Boosting Classification\n",
|
|||
|
"Accuracy (Cross-Validation): Mean = 0.5178895096213532, Std = 0.027332603426564073\n",
|
|||
|
"Precision (Cross-Validation): Mean = 0.5084858601973745, Std = 0.022621295137266188\n",
|
|||
|
"Recall (Cross-Validation): Mean = 0.49668028600612874, Std = 0.04700469023993552\n",
|
|||
|
"F1-score (Cross-Validation): Mean = 0.5055891495803455, Std = 0.03687694960165771\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
|||
|
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Определяем категориальные и числовые столбцы\n",
|
|||
|
"\n",
|
|||
|
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
|||
|
"\n",
|
|||
|
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
|||
|
"preprocessor = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" ('num', StandardScaler(), numerical_cols)\n",
|
|||
|
" ])\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
|||
|
"X_reg = df[numerical_cols]\n",
|
|||
|
"y_reg = df['Daily_Customer_Count']\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей для задачи регрессии\n",
|
|||
|
"models_reg = {\n",
|
|||
|
" \"Linear Regression\": LinearRegression(),\n",
|
|||
|
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
|||
|
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Оценка смещения и дисперсии для задачи регрессии\n",
|
|||
|
"print(\"Оценка смещения и дисперсии для задачи регрессии:\")\n",
|
|||
|
"for name, model in models_reg.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n",
|
|||
|
" r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"MAE (Cross-Validation): Mean = {mae_scores.mean()}, Std = {mae_scores.std()}\")\n",
|
|||
|
" print(f\"R² (Cross-Validation): Mean = {r2_scores.mean()}, Std = {r2_scores.std()}\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
|||
|
"X_class = df[numerical_cols]\n",
|
|||
|
"y_class = (df['Daily_Customer_Count'] > df['Daily_Customer_Count'].mean()).astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей для задачи классификации\n",
|
|||
|
"models_class = {\n",
|
|||
|
" \"Logistic Regression\": LogisticRegression(),\n",
|
|||
|
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
|||
|
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Оценка смещения и дисперсии для задачи классификации\n",
|
|||
|
"print(\"Оценка смещения и дисперсии для задачи классификации:\")\n",
|
|||
|
"for name, model in models_class.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n",
|
|||
|
" precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n",
|
|||
|
" recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n",
|
|||
|
" f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n",
|
|||
|
" print(f\"Model: {name}\")\n",
|
|||
|
" print(f\"Accuracy (Cross-Validation): Mean = {accuracy_scores.mean()}, Std = {accuracy_scores.std()}\")\n",
|
|||
|
" print(f\"Precision (Cross-Validation): Mean = {precision_scores.mean()}, Std = {precision_scores.std()}\")\n",
|
|||
|
" print(f\"Recall (Cross-Validation): Mean = {recall_scores.mean()}, Std = {recall_scores.std()}\")\n",
|
|||
|
" print(f\"F1-score (Cross-Validation): Mean = {f1_scores.mean()}, Std = {f1_scores.std()}\")\n",
|
|||
|
" print()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABJsAAAJOCAYAAAAQ1Aa7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACJrklEQVR4nOzdeZxO5eP/8ffsq5kxmjXLMJI1RJjIEs0MIkWyFRKylSVKn8pMKZ9S+IQsLbQYLYpKspNIEmmx00hpBhHDYMxy/f7wm/vrNjPMcGbuGfN6Ph4e7vuc65xznfu+7nNf877PuY6TMcYIAAAAAAAAsICzoysAAAAAAACA6wdhEwAAAAAAACxD2AQAAAAAAADLEDYBAAAAAADAMoRNAAAAAAAAsAxhEwAAAAAAACxD2AQAAAAAAADLEDYBAAAAAADAMoRNAAAAAAAAsAxhE4qdPn36yNfXt0i3eeDAATk5OWnu3LlFut3r2ccff6zAwECdPn3a0VXJU5MmTTRmzJh8lz99+rQeeeQRhYaGysnJScOHDy+8ysHGyclJcXFxjq5GkbmW49HatWvl5OSktWvXWl4vALgeFeQ75ocffpC7u7v++OMPy+sxc+ZMVaxYUWlpafle5v3331f16tXl5uamgIAAy+uEnPr06aOIiAhHV6NItWzZUi1btryqZSMiItSnTx9L64OSg7CpGJs7d66cnJzk5OSk9evX55hvjFGFChXk5OSku+++2wE1LLjMzEyFh4fLyclJX3/9taOrY4kzZ84oLi6uUP64y37/c/v36KOPWr49q2RmZmrcuHEaNmyYXXAYEREhJycntWnTJtfl3nzzTdv+/fjjj7mWGTNmjJycnPTAAw/kOj/7D/W8/v33v/+1lX3yySc1ffp0JScn52u/XnrpJc2dO1eDBg3S+++/rwcffDBfy12t7Ncr+5+Pj48aNWqk9957r1C3iwvi4uLk5OQkZ2dn/fnnnznmp6SkyMvLS05OTho6dKgDaggA9v1FJycnubq66sYbb1SfPn106NAhR1fvuvKf//xH3bt3V6VKlWzTWrZsaff6e3l56ZZbbtGUKVOUlZVlt/yDDz6oZs2aqXHjxmrRooV27txpm9enTx+dP39es2bNyldddu3apT59+igyMlJvvvmmZs+ebc1O5iH7OzH7n5ubmyIiIvTYY4/pxIkThbpt2Pdvx48fn2uZnj17ysnJqch/tAfy4uroCuDKPD09lZCQoGbNmtlN/+abb/TXX3/Jw8PDQTUruNWrVyspKUkRERGaN2+e2rZt6+gqXbMzZ84oPj5ekq469b+cu+66Sw899FCO6dWqVbN8W1b58ssvtXv3bg0YMCDHPE9PT61Zs0bJyckKDQ21mzdv3jx5enrq3Llzua7XGKP58+crIiJCX375pU6dOqUyZcrkWrZ79+5q165djun169e3Pb7nnnvk5+enN954Q88///wV92v16tVq0qSJxo0bd8WyVqlXr55GjRolSUpKStJbb72l3r17Ky0tTf379y+yejjS2bNn5erquK8rDw8PzZ8/P8dZcJ999pmDagQAOT3//POqXLmyzp07p++//15z587V+vXr9dtvv8nT09PR1Svxtm3bppUrV+q7777LMa98+fKaMGGCJOmff/5RQkKCRowYoaNHj+rFF1+0lXv22Wdt/bfhw4dr8ODBWrNmjaQL/aPevXtr0qRJGjZsmJycnC5bn7Vr1yorK0v/+9//VLVqVat284pmzJghX19fpaamatWqVZo6daq2bt2a6w/j16M333wzR4hYlDw9PTV//nw988wzdtNTU1P1+eef81lHscKZTSVAu3bt9MknnygjI8NuekJCgho0aJDjD/bi7IMPPtCtt96qESNGaNGiRUpNTXV0lYq9atWqqVevXjn+NWrU6LLLnTlzJtfpGRkZOn/+/DXV6Urv25w5c9S0aVPdeOONOeY1bdpUvr6++uijj+ym//XXX/r222/Vvn37PNe7du1a/fXXX3rnnXeUkZFx2T/2b7311lxft1q1atnKODs7q0uXLnrvvfdkjLnsPknSkSNHLD1NPT/vxY033mir++jRo7V+/Xr5+vpq8uTJltUjvxz1efX09HRo2NSuXTvNnz8/x/SEhITLtlcAKEpt27ZVr1699Mgjj+itt97SE088of379+uLL75wdNUKRV79nMIyZ84cVaxYUU2aNMkxz9/f3/ZdPXz4cK1bt06VKlXS1KlTlZmZaSt38Q+Fxhg5O9v/Kda1a1f98ccftgDqco4cOSJJlvZL8vOadunSRb169dLAgQP18ccf64EHHtCGDRv0ww8/WFaP/MjKysrzx8nC5Obm5tAf+tu1a6cdO3bo559/tpv++eef6/z587rrrrscVDMgJ8KmEqB79+46duyYVqxYYZt2/vx5LViwQD169Mh1maysLE2ZMkW1atWSp6enQkJCNHDgQP3777925T7//HO1b99e4eHh8vDwUGRkpF544QW7L0bpwhk7tWvX1o4dO9SqVSt5e3vrxhtv1CuvvJLv/Th79qwWLlyobt26qWvXrjp79qw+//zzPMv//vvviomJkY+Pj8LDw/X888/nCAQ+/PBDNWjQQGXKlJGfn5/q1Kmj//3vfznWc//99yswMFDe3t5q0qSJvvrqqyvWN6/rky++VvvAgQMKCgqSJMXHx9tOb7342v9du3apS5cuCgwMlKenpxo2bGh5xy/7/dmyZYuaN28ub29vPf3007ZTbl999VVNmTJFkZGR8vDw0I4dOyRdOFPnjjvukI+PjwICAnTPPffYndIt/d9p0zt27FCPHj1UtmzZHGfZXezcuXNaunRpnpfKeXp66r777lNCQoLd9Pnz56ts2bKKiYnJc93z5s1TzZo11apVK7Vp00bz5s3L70uUp7vuukt//PGHtm3blmeZ7HFwEhMT9dVXX9ne5wMHDki60OHr16+fQkJC5Onpqbp16+rdd9+1W8eV3ov8CgoKUvXq1bV//3676fn9zGdlZSkuLk7h4eHy9vZWq1attGPHjhzX1GdflvHNN99o8ODBCg4OVvny5W3zv/76a1vbKVOmjNq3b6/t27fbbSs5OVl9+/ZV+fLl5eHhobCwMN1zzz22102SfvzxR8XExOiGG26Ql5eXKleurIcffthuPbmNp/HTTz+pbdu28vPzk6+vr1q3bq3vv//erkz2PmzYsEEjR45UUFCQfHx8dO+99+ro0aP5fcnVo0cPbdu2Tbt27bLbt9WrV+d5DM5Pm5CkEydOqE+fPvL391dAQIB69+6d5+UIV3ss2bt3rzp37qzQ0FB5enqqfPny6tatm06ePJm/FwBAiXTHHXdIUo7vi9zk53htjNH48eNVvnx52/fH9u3bc3x/ZPcbLpV9TL54nQXth17az5GktLQ0jRs3TlWrVpWHh4cqVKigMWPG5Bj7KC0tTSNGjFBQUJDKlCmjjh076q+//rria5Nt0aJFuvPOO694xpF0oa9z22236dSpU7ZQ6GKrVq3SW2+9ZXdpvyQ1aNBAgYGBl+0fSxcus88+yzooKCjH9+Qbb7yhWrVqycPDQ+Hh4RoyZEiO75bLvaYFkVc727Rpk2JjY+Xv7y9vb2+1aNFCGzZsyLH82rVr1bBhQ3l6eioyMlKzZs3KtQ1lX7I+b948274tXbpUknTo0CE9/PDDCgkJkYeHh2rVqqV33nknx7amTp2qWrVqydvbW2XLllXDhg3t+qOnTp3S8OHDFRERIQ8PDwUHB+uuu+7S1q1bbWVyG7MpNTVVo0aNUoUKFeTh4aGbb75Zr776ao6/W7L3YdGiRapdu7atrtn7kR9RUVGqXLlyjn70vHnzFBsbq8DAwFyXy0+bkKTZs2crMjJSXl5eatSokb799ttc15ffz92l0tPTFR8fr5tuukmenp4qV66cmjVrZvd3Lq4fXEZXAkRERCgqKkrz58+3XXb29ddf6+TJk+rWrZtef/31HMsMHDhQc+fOVd++ffXYY48pMTFR06ZN008//aQNGzbIzc1N0oUvfl9fX40cOVK+vr5avXq1nnvuOaWkpGjixIl26/z
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x600 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
|||
|
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"\n",
|
|||
|
"# Загружаем набор данных\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Определяем категориальные и числовые столбцы\n",
|
|||
|
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
|||
|
"\n",
|
|||
|
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
|||
|
"preprocessor = ColumnTransformer(\n",
|
|||
|
" transformers=[\n",
|
|||
|
" ('num', StandardScaler(), numerical_cols)\n",
|
|||
|
" ])\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
|||
|
"X_reg = df[numerical_cols]\n",
|
|||
|
"y_reg = df['Daily_Customer_Count']\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей для задачи регрессии\n",
|
|||
|
"models_reg = {\n",
|
|||
|
" \"Linear Regression\": LinearRegression(),\n",
|
|||
|
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
|||
|
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Оценка смещения и дисперсии для задачи регрессии\n",
|
|||
|
"mae_means = []\n",
|
|||
|
"mae_stds = []\n",
|
|||
|
"r2_means = []\n",
|
|||
|
"r2_stds = []\n",
|
|||
|
"\n",
|
|||
|
"for name, model in models_reg.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n",
|
|||
|
" r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n",
|
|||
|
" mae_means.append(mae_scores.mean())\n",
|
|||
|
" mae_stds.append(mae_scores.std())\n",
|
|||
|
" r2_means.append(r2_scores.mean())\n",
|
|||
|
" r2_stds.append(r2_scores.std())\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов для задачи регрессии\n",
|
|||
|
"fig, ax = plt.subplots(1, 2, figsize=(12, 6))\n",
|
|||
|
"\n",
|
|||
|
"ax[0].bar(models_reg.keys(), mae_means, yerr=mae_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
|
|||
|
"ax[0].set_ylabel('MAE')\n",
|
|||
|
"ax[0].set_title('Mean Absolute Error (MAE) for Regression Models')\n",
|
|||
|
"ax[0].yaxis.grid(True)\n",
|
|||
|
"\n",
|
|||
|
"ax[1].bar(models_reg.keys(), r2_means, yerr=r2_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
|
|||
|
"ax[1].set_ylabel('R²')\n",
|
|||
|
"ax[1].set_title('R-squared (R²) for Regression Models')\n",
|
|||
|
"ax[1].yaxis.grid(True)\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
|||
|
"X_class = df[numerical_cols]\n",
|
|||
|
"y_class = (df['Daily_Customer_Count'] > df['Daily_Customer_Count'].mean()).astype(int)\n",
|
|||
|
"\n",
|
|||
|
"# Список моделей для задачи классификации\n",
|
|||
|
"models_class = {\n",
|
|||
|
" \"Logistic Regression\": LogisticRegression(),\n",
|
|||
|
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
|||
|
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"# Оценка смещения и дисперсии для задачи классификации\n",
|
|||
|
"accuracy_means = []\n",
|
|||
|
"accuracy_stds = []\n",
|
|||
|
"precision_means = []\n",
|
|||
|
"precision_stds = []\n",
|
|||
|
"recall_means = []\n",
|
|||
|
"recall_stds = []\n",
|
|||
|
"f1_means = []\n",
|
|||
|
"f1_stds = []\n",
|
|||
|
"\n",
|
|||
|
"for name, model in models_class.items():\n",
|
|||
|
" pipeline = Pipeline(steps=[\n",
|
|||
|
" ('preprocessor', preprocessor),\n",
|
|||
|
" ('model', model)\n",
|
|||
|
" ])\n",
|
|||
|
" accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n",
|
|||
|
" precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n",
|
|||
|
" recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n",
|
|||
|
" f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n",
|
|||
|
" accuracy_means.append(accuracy_scores.mean())\n",
|
|||
|
" accuracy_stds.append(accuracy_scores.std())\n",
|
|||
|
" precision_means.append(precision_scores.mean())\n",
|
|||
|
" precision_stds.append(precision_scores.std())\n",
|
|||
|
" recall_means.append(recall_scores.mean())\n",
|
|||
|
" recall_stds.append(recall_scores.std())\n",
|
|||
|
" f1_means.append(f1_scores.mean())\n",
|
|||
|
" f1_stds.append(f1_scores.std())\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов для задачи классификации\n",
|
|||
|
"fig, ax = plt.subplots(2, 2, figsize=(12, 12))\n",
|
|||
|
"\n",
|
|||
|
"ax[0, 0].bar(models_class.keys(), accuracy_means, yerr=accuracy_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
|
|||
|
"ax[0, 0].set_ylabel('Accuracy')\n",
|
|||
|
"ax[0, 0].set_title('Accuracy for Classification Models')\n",
|
|||
|
"ax[0, 0].yaxis.grid(True)\n",
|
|||
|
"\n",
|
|||
|
"ax[0, 1].bar(models_class.keys(), precision_means, yerr=precision_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
|
|||
|
"ax[0, 1].set_ylabel('Precision')\n",
|
|||
|
"ax[0, 1].set_title('Precision for Classification Models')\n",
|
|||
|
"ax[0, 1].yaxis.grid(True)\n",
|
|||
|
"\n",
|
|||
|
"ax[1, 0].bar(models_class.keys(), recall_means, yerr=recall_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
|
|||
|
"ax[1, 0].set_ylabel('Recall')\n",
|
|||
|
"ax[1, 0].set_title('Recall for Classification Models')\n",
|
|||
|
"ax[1, 0].yaxis.grid(True)\n",
|
|||
|
"\n",
|
|||
|
"ax[1, 1].bar(models_class.keys(), f1_means, yerr=f1_stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
|
|||
|
"ax[1, 1].set_ylabel('F1-score')\n",
|
|||
|
"ax[1, 1].set_title('F1-score for Classification Models')\n",
|
|||
|
"ax[1, 1].yaxis.grid(True)\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aisenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|