1774 lines
212 KiB
Plaintext
1774 lines
212 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Начало лабораторной работы"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 67,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['Store ID ', 'Store_Area', 'Items_Available', 'Daily_Customer_Count',\n",
|
||
" 'Store_Sales'],\n",
|
||
" dtype='object')\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"print(df.columns)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Бизнес-цели"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование посетителей в магазине:\n",
|
||
"\n",
|
||
"Цель: Разработать модель, которая будет предсказывать посещаемость магазина на основе его характеристик (площадь, объём продаж, количество товаров в ассортименте).\n",
|
||
"\n",
|
||
"Применение:\n",
|
||
"Прогнозирование ежедневной посещаемости магазинов.\n",
|
||
"\n",
|
||
"2. Оптимизация параметров магазина:\n",
|
||
"\n",
|
||
"Цель: Определить оптимальные коэффициенты для различных факторов, влияющих на посещаемость магазина, чтобы максимизировать прибыль компании при наименьших затратах на пространство магазина и его ассортимент.\n",
|
||
"\n",
|
||
"Применение:\n",
|
||
"Проектирование магазинов с максимальной посещаемостью."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование посетителей в магазине"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 68,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Среднее значение поля 'Daily_Customer_Count': 786.3504464285714\n",
|
||
" Store ID Store_Area Items_Available Daily_Customer_Count Store_Sales \\\n",
|
||
"0 1 1659 1961 530 66490 \n",
|
||
"1 2 1461 1752 210 39820 \n",
|
||
"2 3 1340 1609 720 54010 \n",
|
||
"3 4 1451 1748 620 53730 \n",
|
||
"4 5 1770 2111 450 46620 \n",
|
||
"\n",
|
||
" above_average_count customers_volatility \n",
|
||
"0 0 1550 \n",
|
||
"1 0 1550 \n",
|
||
"2 0 1550 \n",
|
||
"3 0 1550 \n",
|
||
"4 0 1550 \n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"# Устанавливаем случайное состояние\n",
|
||
"random_state = 42\n",
|
||
"\n",
|
||
"# Рассчитываем среднее значение посещаемости\n",
|
||
"average_count = df['Daily_Customer_Count'].mean()\n",
|
||
"print(f\"Среднее значение поля 'Daily_Customer_Count': {average_count}\")\n",
|
||
"\n",
|
||
"# Создаем новую переменную, указывающую, превышает ли посещаемость среднюю\n",
|
||
"df[\"above_average_count\"] = (df[\"Daily_Customer_Count\"] > average_count).astype(int)\n",
|
||
"\n",
|
||
"# Рассчитываем волатильность (разницу между максимальной и минимальной посещаемостью)\n",
|
||
"df[\"customers_volatility\"] = df[\"Daily_Customer_Count\"].max() - df[\"Daily_Customer_Count\"].min()\n",
|
||
"\n",
|
||
"# Выводим первые строки измененной таблицы для проверки\n",
|
||
"print(df.head())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"2. Оптимизация параметров магазина:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 69,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Средняя посещаемость для 'Store_Area':\n",
|
||
"Store_Area\n",
|
||
"775 1090.0\n",
|
||
"780 790.0\n",
|
||
"854 660.0\n",
|
||
"869 850.0\n",
|
||
"891 630.0\n",
|
||
" ... \n",
|
||
"2063 810.0\n",
|
||
"2067 790.0\n",
|
||
"2169 600.0\n",
|
||
"2214 740.0\n",
|
||
"2229 660.0\n",
|
||
"Name: Daily_Customer_Count, Length: 583, dtype: float64\n",
|
||
"\n",
|
||
"Средняя посещаемость для 'Items_Available':\n",
|
||
"Items_Available\n",
|
||
"932 1090.0\n",
|
||
"951 790.0\n",
|
||
"1018 660.0\n",
|
||
"1050 850.0\n",
|
||
"1059 870.0\n",
|
||
" ... \n",
|
||
"2492 790.0\n",
|
||
"2493 810.0\n",
|
||
"2617 600.0\n",
|
||
"2647 740.0\n",
|
||
"2667 660.0\n",
|
||
"Name: Daily_Customer_Count, Length: 616, dtype: float64\n",
|
||
"\n",
|
||
"Средняя посещаемость для 'Store_Sales':\n",
|
||
"Store_Sales\n",
|
||
"14920 990.0\n",
|
||
"16370 880.0\n",
|
||
"17670 660.0\n",
|
||
"20270 870.0\n",
|
||
"21300 850.0\n",
|
||
" ... \n",
|
||
"101820 820.0\n",
|
||
"102310 1310.0\n",
|
||
"102920 680.0\n",
|
||
"105150 980.0\n",
|
||
"116320 860.0\n",
|
||
"Name: Daily_Customer_Count, Length: 816, dtype: float64\n",
|
||
"\n",
|
||
"Средняя посещаемость для комбинации 'Store_Area' и 'Items_Available':\n",
|
||
"Store_Area Items_Available\n",
|
||
"775 932 1090.0\n",
|
||
"780 951 790.0\n",
|
||
"854 1018 660.0\n",
|
||
"869 1050 850.0\n",
|
||
"891 1073 630.0\n",
|
||
" ... \n",
|
||
"2063 2493 810.0\n",
|
||
"2067 2492 790.0\n",
|
||
"2169 2617 600.0\n",
|
||
"2214 2647 740.0\n",
|
||
"2229 2667 660.0\n",
|
||
"Name: Daily_Customer_Count, Length: 892, dtype: float64\n",
|
||
"\n",
|
||
"Средняя посещаемость для комбинации 'Store_Sales' и 'Items_Available':\n",
|
||
"Store_Sales Items_Available\n",
|
||
"14920 1508 990.0\n",
|
||
"16370 1790 880.0\n",
|
||
"17670 1877 660.0\n",
|
||
"20270 1946 870.0\n",
|
||
"21300 1686 850.0\n",
|
||
" ... \n",
|
||
"101820 1758 820.0\n",
|
||
"102310 1587 1310.0\n",
|
||
"102920 1638 680.0\n",
|
||
"105150 2104 980.0\n",
|
||
"116320 2414 860.0\n",
|
||
"Name: Daily_Customer_Count, Length: 896, dtype: float64\n",
|
||
"\n",
|
||
"Средняя посещаемость для комбинации 'Store_Sales' и 'Store_Area':\n",
|
||
"Store_Sales Store_Area\n",
|
||
"14920 1250 990.0\n",
|
||
"16370 1477 880.0\n",
|
||
"17670 1537 660.0\n",
|
||
"20270 1624 870.0\n",
|
||
"21300 1397 850.0\n",
|
||
" ... \n",
|
||
"101820 1486 820.0\n",
|
||
"102310 1303 1310.0\n",
|
||
"102920 1365 680.0\n",
|
||
"105150 1775 980.0\n",
|
||
"116320 1989 860.0\n",
|
||
"Name: Daily_Customer_Count, Length: 896, dtype: float64\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"# Устанавливаем случайное состояние\n",
|
||
"random_state = 42\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю посещаемость для каждого значения каждого признака\n",
|
||
"for column in [\n",
|
||
" \"Store_Area\",\n",
|
||
" \"Items_Available\",\n",
|
||
" \"Store_Sales\"\n",
|
||
"]:\n",
|
||
" print(f\"Средняя посещаемость для '{column}':\")\n",
|
||
" print(df.groupby(column)[\"Daily_Customer_Count\"].mean())\n",
|
||
" print()\n",
|
||
"\n",
|
||
"\n",
|
||
"print(\"Средняя посещаемость для комбинации 'Store_Area' и 'Items_Available':\")\n",
|
||
"print(df.groupby([\"Store_Area\", \"Items_Available\"])[\"Daily_Customer_Count\"].mean())\n",
|
||
"print()\n",
|
||
"\n",
|
||
"\n",
|
||
"print(\"Средняя посещаемость для комбинации 'Store_Sales' и 'Items_Available':\")\n",
|
||
"print(df.groupby([\"Store_Sales\", \"Items_Available\"])[\"Daily_Customer_Count\"].mean())\n",
|
||
"print()\n",
|
||
"\n",
|
||
"\n",
|
||
"print(\"Средняя посещаемость для комбинации 'Store_Sales' и 'Store_Area':\")\n",
|
||
"print(df.groupby([\"Store_Sales\", \"Store_Area\"])[\"Daily_Customer_Count\"].mean())\n",
|
||
"print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Выбор ориентира:\n",
|
||
"1. Прогнозирование посещаемости магазина:\n",
|
||
"Ориентир:\n",
|
||
"\n",
|
||
"R² (коэффициент детерминации): 0.75 - 0.85\n",
|
||
"\n",
|
||
"MAE (средняя абсолютная ошибка): 150 - 300 человек\n",
|
||
"\n",
|
||
"RMSE (среднеквадратичная ошибка): 175 - 315 человек\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 70,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"MAE: 241.24369535006045\n",
|
||
"MSE: 82946.49105226391\n",
|
||
"RMSE: 288.004324711043\n",
|
||
"R²: -0.008816097180501359\n",
|
||
"Ориентиры для прогнозирования не достигнуты.\n",
|
||
"Средняя посещаемость 'Store_Area':\n",
|
||
"Store_Area\n",
|
||
"775 1090.0\n",
|
||
"780 790.0\n",
|
||
"854 660.0\n",
|
||
"869 850.0\n",
|
||
"891 630.0\n",
|
||
" ... \n",
|
||
"2063 810.0\n",
|
||
"2067 790.0\n",
|
||
"2169 600.0\n",
|
||
"2214 740.0\n",
|
||
"2229 660.0\n",
|
||
"Name: Daily_Customer_Count, Length: 583, dtype: float64\n",
|
||
"\n",
|
||
"Средняя посещаемость 'Items_Available':\n",
|
||
"Items_Available\n",
|
||
"932 1090.0\n",
|
||
"951 790.0\n",
|
||
"1018 660.0\n",
|
||
"1050 850.0\n",
|
||
"1059 870.0\n",
|
||
" ... \n",
|
||
"2492 790.0\n",
|
||
"2493 810.0\n",
|
||
"2617 600.0\n",
|
||
"2647 740.0\n",
|
||
"2667 660.0\n",
|
||
"Name: Daily_Customer_Count, Length: 616, dtype: float64\n",
|
||
"\n",
|
||
"Средняя посещаемость 'Store_Sales':\n",
|
||
"Store_Sales\n",
|
||
"14920 990.0\n",
|
||
"16370 880.0\n",
|
||
"17670 660.0\n",
|
||
"20270 870.0\n",
|
||
"21300 850.0\n",
|
||
" ... \n",
|
||
"101820 820.0\n",
|
||
"102310 1310.0\n",
|
||
"102920 680.0\n",
|
||
"105150 980.0\n",
|
||
"116320 860.0\n",
|
||
"Name: Daily_Customer_Count, Length: 816, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для комбинации 'Store_Area' и 'Items_Available':\n",
|
||
"Store_Area Items_Available\n",
|
||
"775 932 1090.0\n",
|
||
"780 951 790.0\n",
|
||
"854 1018 660.0\n",
|
||
"869 1050 850.0\n",
|
||
"891 1073 630.0\n",
|
||
" ... \n",
|
||
"2063 2493 810.0\n",
|
||
"2067 2492 790.0\n",
|
||
"2169 2617 600.0\n",
|
||
"2214 2647 740.0\n",
|
||
"2229 2667 660.0\n",
|
||
"Name: Daily_Customer_Count, Length: 892, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для комбинации 'Items_Available' и 'Store_Sales':\n",
|
||
"Items_Available Store_Sales\n",
|
||
"932 42530 1090.0\n",
|
||
"951 25600 790.0\n",
|
||
"1018 77740 660.0\n",
|
||
"1050 52540 850.0\n",
|
||
"1059 75110 870.0\n",
|
||
" ... \n",
|
||
"2492 70230 790.0\n",
|
||
"2493 51480 810.0\n",
|
||
"2617 67080 600.0\n",
|
||
"2647 65900 740.0\n",
|
||
"2667 87410 660.0\n",
|
||
"Name: Daily_Customer_Count, Length: 896, dtype: float64\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y)\n",
|
||
"X = df.drop(\"Daily_Customer_Count\", axis=1)\n",
|
||
"y = df[\"Daily_Customer_Count\"]\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки\n",
|
||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Стандартизируем признаки\n",
|
||
"scaler = StandardScaler()\n",
|
||
"X_train = scaler.fit_transform(X_train)\n",
|
||
"X_test = scaler.transform(X_test)\n",
|
||
"\n",
|
||
"# Обучаем модель линейной регрессии\n",
|
||
"model = LinearRegression()\n",
|
||
"model.fit(X_train, y_train)\n",
|
||
"\n",
|
||
"# Делаем предсказания на тестовой выборке\n",
|
||
"y_pred = model.predict(X_test)\n",
|
||
"\n",
|
||
"# Оцениваем качество модели\n",
|
||
"mae = mean_absolute_error(y_test, y_pred)\n",
|
||
"mse = mean_squared_error(y_test, y_pred)\n",
|
||
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
|
||
"r2 = r2_score(y_test, y_pred)\n",
|
||
"\n",
|
||
"print(f\"MAE: {mae}\")\n",
|
||
"print(f\"MSE: {mse}\")\n",
|
||
"print(f\"RMSE: {rmse}\")\n",
|
||
"print(f\"R²: {r2}\")\n",
|
||
"\n",
|
||
"# Проверяем, достигнуты ли ориентиры\n",
|
||
"if r2 >= 0.75 and mae <= 300 and rmse <= 350:\n",
|
||
" print(\"Ориентиры для прогнозирования достигнуты!\")\n",
|
||
"else:\n",
|
||
" print(\"Ориентиры для прогнозирования не достигнуты.\")\n",
|
||
"\n",
|
||
"\n",
|
||
"columns_to_group = [\n",
|
||
" \"Store_Area\",\n",
|
||
" \"Items_Available\",\n",
|
||
" \"Store_Sales\"\n",
|
||
"]\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю посещаемость для каждого значения каждого признака\n",
|
||
"for column in columns_to_group:\n",
|
||
" print(f\"Средняя посещаемость '{column}':\")\n",
|
||
" print(df.groupby(column)[\"Daily_Customer_Count\"].mean())\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю посещаемость для комбинаций признаков\n",
|
||
"\n",
|
||
"print(\n",
|
||
" \"Средняя стоимость страховых взносов для комбинации 'Store_Area' и 'Items_Available':\"\n",
|
||
")\n",
|
||
"print(df.groupby([\"Store_Area\", \"Items_Available\"])[\"Daily_Customer_Count\"].mean())\n",
|
||
"print()\n",
|
||
"\n",
|
||
"print(\n",
|
||
" \"Средняя стоимость страховых взносов для комбинации 'Items_Available' и 'Store_Sales':\"\n",
|
||
")\n",
|
||
"print(df.groupby([\"Items_Available\", \"Store_Sales\"])[\"Daily_Customer_Count\"].mean())\n",
|
||
"print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Анализ применимости алгоритмов обучения с учителем для решения поставленных задач:\n",
|
||
"1. Прогнозирование посещаемости магазинов:\n",
|
||
"Задача: Регрессия\n",
|
||
"\n",
|
||
"Свойства алгоритмов:\n",
|
||
"\n",
|
||
"Линейная регрессия:\n",
|
||
"Применимость: Хорошо подходит для задач, где зависимость между признаками и целевой переменной линейна.\n",
|
||
"Преимущества: Проста в реализации, интерпретируема.\n",
|
||
"Недостатки: Может плохо работать, если зависимость нелинейна.\n",
|
||
"\n",
|
||
"Деревья решений (регрессия):\n",
|
||
"Применимость: Подходит для задач с нелинейными зависимостями.\n",
|
||
"Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных.\n",
|
||
"Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n",
|
||
"\n",
|
||
"Случайный лес (регрессия):\n",
|
||
"Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков.\n",
|
||
"Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки.\n",
|
||
"Недостатки: Менее интерпретируем, чем линейная регрессия.\n",
|
||
"\n",
|
||
"Градиентный бустинг (регрессия):\n",
|
||
"Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками.\n",
|
||
"Преимущества: Может достигать высокой точности, устойчив к переобучению.\n",
|
||
"Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n",
|
||
"\n",
|
||
"Нейронные сети (регрессия):\n",
|
||
"Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных.\n",
|
||
"Преимущества: Может моделировать очень сложные зависимости.\n",
|
||
"Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n",
|
||
"\n",
|
||
"Вывод:\n",
|
||
"\n",
|
||
"Линейная регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n",
|
||
"\n",
|
||
"Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n",
|
||
"\n",
|
||
"Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n",
|
||
"\n",
|
||
"Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно много.\n",
|
||
"\n",
|
||
"2. Оптимизация параметров магазина:\n",
|
||
"Задача: Классификация (разделение магазинов на группы с посещаемостью выше и ниже средней)\n",
|
||
"\n",
|
||
"Свойства алгоритмов:\n",
|
||
"\n",
|
||
"Логистическая регрессия:\n",
|
||
"Применимость: Хорошо подходит для задач бинарной классификации, где зависимость между признаками и целевой переменной линейна.\n",
|
||
"Преимущества: Проста в реализации, интерпретируема.\n",
|
||
"Недостатки: Может плохо работать, если зависимость нелинейна.\n",
|
||
"\n",
|
||
"Деревья решений (классификация):\n",
|
||
"Применимость: Подходит для задач с нелинейными зависимостями.\n",
|
||
"Преимущества: Может обрабатывать категориальные признаки, не требует масштабирования данных.\n",
|
||
"Недостатки: Подвержены переобучению, могут давать нестабильные результаты.\n",
|
||
"\n",
|
||
"Случайный лес (классификация):\n",
|
||
"Применимость: Хорошо подходит для задач с нелинейными зависимостями и большим количеством признаков.\n",
|
||
"Преимущества: Устойчив к переобучению, может обрабатывать категориальные признаки.\n",
|
||
"Недостатки: Менее интерпретируем, чем логистическая регрессия.\n",
|
||
"\n",
|
||
"Градиентный бустинг (классификация):\n",
|
||
"Применимость: Подходит для задач с нелинейными зависимостями и сложными взаимосвязями между признаками.\n",
|
||
"Преимущества: Может достигать высокой точности, устойчив к переобучению.\n",
|
||
"Недостатки: Сложнее в настройке, чем случайный лес, менее интерпретируем.\n",
|
||
"\n",
|
||
"Нейронные сети (классификация):\n",
|
||
"Применимость: Подходит для задач с очень сложными зависимостями и большим количеством данных.\n",
|
||
"Преимущества: Может моделировать очень сложные зависимости.\n",
|
||
"Недостатки: Требует большого количества данных, сложнее в настройке и интерпретации.\n",
|
||
"\n",
|
||
"Вывод:\n",
|
||
"\n",
|
||
"Логистическая регрессия: Может быть хорошим выбором для начала, особенно если зависимость между признаками и целевой переменной линейна.\n",
|
||
"\n",
|
||
"Деревья решений и случайный лес: Подходят для задач с нелинейными зависимостями.\n",
|
||
"\n",
|
||
"Градиентный бустинг: Может давать более высокую точность, чем случайный лес, но требует больше времени на настройку.\n",
|
||
"\n",
|
||
"Нейронные сети: Могут быть излишними для этой задачи, если данных недостаточно много.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование посещаемости магазина:\n",
|
||
"Выбранные модели:\n",
|
||
"\n",
|
||
"Линейная регрессия\n",
|
||
"\n",
|
||
"Случайный лес (регрессия)\n",
|
||
"\n",
|
||
"Градиентный бустинг (регрессия)\n",
|
||
"\n",
|
||
"2. Оптимизация параметров магазина:\n",
|
||
"Выбранные модели:\n",
|
||
"\n",
|
||
"Логистическая регрессия\n",
|
||
"\n",
|
||
"Случайный лес (классификация)\n",
|
||
"\n",
|
||
"Градиентный бустинг (классификация)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"MAE: 241.24369535006045\n",
|
||
"MSE: 82946.49105226391\n",
|
||
"RMSE: 288.004324711043\n",
|
||
"R²: -0.008816097180501359\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Regression\n",
|
||
"MAE: 240.68666666666667\n",
|
||
"MSE: 85748.043\n",
|
||
"RMSE: 292.82766775016324\n",
|
||
"R²: -0.042889276963148815\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Gradient Boosting Regression\n",
|
||
"MAE: 243.53822748120598\n",
|
||
"MSE: 86937.70201509264\n",
|
||
"RMSE: 294.85200018838714\n",
|
||
"R²: -0.05735820927548918\n",
|
||
"\n",
|
||
"Результаты для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Accuracy: 0.43333333333333335\n",
|
||
"\n",
|
||
"Model: Random Forest Classification\n",
|
||
"Accuracy: 0.4777777777777778\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Accuracy: 0.4888888888888889\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
||
"X_reg = df.drop(\"Daily_Customer_Count\", axis=1)\n",
|
||
"y_reg = df[\"Daily_Customer_Count\"]\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Стандартизируем признаки для задачи регрессии\n",
|
||
"scaler_reg = StandardScaler()\n",
|
||
"X_train_reg = scaler_reg.fit_transform(X_train_reg)\n",
|
||
"X_test_reg = scaler_reg.transform(X_test_reg)\n",
|
||
"\n",
|
||
"# Список моделей для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": LinearRegression(),\n",
|
||
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
||
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
||
"print(\"Результаты для задачи регрессии:\")\n",
|
||
"for name, model in models_reg.items():\n",
|
||
" model.fit(X_train_reg, y_train_reg)\n",
|
||
" y_pred_reg = model.predict(X_test_reg)\n",
|
||
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
||
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
||
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
||
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\")\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
||
"X_class = df.drop(\"Daily_Customer_Count\", axis=1)\n",
|
||
"y_class = (df[\"Daily_Customer_Count\"] > df[\"Daily_Customer_Count\"].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Стандартизируем признаки для задачи классификации\n",
|
||
"scaler_class = StandardScaler()\n",
|
||
"X_train_class = scaler_class.fit_transform(X_train_class)\n",
|
||
"X_test_class = scaler_class.transform(X_test_class)\n",
|
||
"\n",
|
||
"# Список моделей для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": LogisticRegression(),\n",
|
||
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
||
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи классификации\n",
|
||
"print(\"Результаты для задачи классификации:\")\n",
|
||
"for name, model in models_class.items():\n",
|
||
" model.fit(X_train_class, y_train_class)\n",
|
||
" y_pred_class = model.predict(X_test_class)\n",
|
||
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Accuracy: {accuracy}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование посещаемости магазина:\n",
|
||
"Конвейер для задачи регрессии:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 72,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"MAE: 240.99246411452697\n",
|
||
"MSE: 82771.10925011222\n",
|
||
"RMSE: 287.6996858707222\n",
|
||
"R²: -0.0066830595689202354\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Regression\n",
|
||
"MAE: 247.89333333333335\n",
|
||
"MSE: 94993.29455555555\n",
|
||
"RMSE: 308.2098222892248\n",
|
||
"R²: -0.15533235289568936\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Regression\n",
|
||
"MAE: 251.77123469394226\n",
|
||
"MSE: 91978.0886332414\n",
|
||
"RMSE: 303.27889579270334\n",
|
||
"R²: -0.11866065970944106\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"\n",
|
||
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
||
"\n",
|
||
"\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": LinearRegression(),\n",
|
||
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
||
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
||
"X_reg = df[numerical_cols]\n",
|
||
"y_reg = df[\"Daily_Customer_Count\"]\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
||
"print(\"Результаты для задачи регрессии:\")\n",
|
||
"for name, model in models_reg.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" pipeline.fit(X_train_reg, y_train_reg)\n",
|
||
" y_pred_reg = pipeline.predict(X_test_reg)\n",
|
||
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
||
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
||
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
||
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"2. Оптимизация характеристик магазина:\n",
|
||
"Конвейер для задачи классификации:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 73,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Accuracy: 0.46111111111111114\n",
|
||
"\n",
|
||
"Model: Random Forest Classification\n",
|
||
"Accuracy: 0.4722222222222222\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Accuracy: 0.4722222222222222\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import accuracy_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": LogisticRegression(),\n",
|
||
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
||
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
||
"X_class = df[numerical_cols]\n",
|
||
"y_class = (df[\"Daily_Customer_Count\"] > df[\"Daily_Customer_Count\"].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи классификации\n",
|
||
"print(\"Результаты для задачи классификации:\")\n",
|
||
"for name, model in models_class.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" pipeline.fit(X_train_class, y_train_class)\n",
|
||
" y_pred_class = pipeline.predict(X_test_class)\n",
|
||
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Accuracy: {accuracy}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование посещения:\n",
|
||
"\n",
|
||
"Настройка гиперпараметров для задачи регрессии:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 74,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"Best Parameters: {}\n",
|
||
"MAE: 240.99246411452697\n",
|
||
"MSE: 82771.10925011222\n",
|
||
"RMSE: 287.6996858707222\n",
|
||
"R²: -0.0066830595689202354\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Regression\n",
|
||
"Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 100}\n",
|
||
"MAE: 242.55834193962204\n",
|
||
"MSE: 87591.55194330998\n",
|
||
"RMSE: 295.9586997256712\n",
|
||
"R²: -0.06531049664000643\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Regression\n",
|
||
"Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100}\n",
|
||
"MAE: 241.05326789654333\n",
|
||
"MSE: 82428.16277986151\n",
|
||
"RMSE: 287.1030525436146\n",
|
||
"R²: -0.0025120582972431027\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"# Определяем категориальные и числовые столбцы\n",
|
||
"\n",
|
||
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей и их гиперпараметров для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": (LinearRegression(), {}),\n",
|
||
" \"Random Forest Regression\": (RandomForestRegressor(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__max_depth': [None, 10, 20]\n",
|
||
" }),\n",
|
||
" \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__learning_rate': [0.01, 0.1],\n",
|
||
" 'model__max_depth': [3, 5]\n",
|
||
" })\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
||
"X_reg = df[numerical_cols]\n",
|
||
"y_reg = df['Daily_Customer_Count']\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
||
"print(\"Результаты для задачи регрессии:\")\n",
|
||
"for name, (model, params) in models_reg.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n",
|
||
" grid_search.fit(X_train_reg, y_train_reg)\n",
|
||
" best_model = grid_search.best_estimator_\n",
|
||
" y_pred_reg = best_model.predict(X_test_reg)\n",
|
||
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
||
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
||
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
||
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"2. Оптимизация характеристик:\n",
|
||
"\n",
|
||
"Настройка гиперпараметров для задачи классификации:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 75,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Best Parameters: {'model__C': 10, 'model__solver': 'lbfgs'}\n",
|
||
"Accuracy: 0.46111111111111114\n",
|
||
"\n",
|
||
"Model: Random Forest Classification\n",
|
||
"Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 100}\n",
|
||
"Accuracy: 0.49444444444444446\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}\n",
|
||
"Accuracy: 0.4777777777777778\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import accuracy_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"# Определяем категориальные и числовые столбцы\n",
|
||
"\n",
|
||
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей и их гиперпараметров для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": (LogisticRegression(), {\n",
|
||
" 'model__C': [0.1, 1, 10],\n",
|
||
" 'model__solver': ['liblinear', 'lbfgs']\n",
|
||
" }),\n",
|
||
" \"Random Forest Classification\": (RandomForestClassifier(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__max_depth': [None, 10, 20]\n",
|
||
" }),\n",
|
||
" \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__learning_rate': [0.01, 0.1],\n",
|
||
" 'model__max_depth': [3, 5]\n",
|
||
" })\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
||
"X_class = df[numerical_cols]\n",
|
||
"y_class = (df['Daily_Customer_Count'] > df['Daily_Customer_Count'].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи классификации\n",
|
||
"print(\"Результаты для задачи классификации:\")\n",
|
||
"for name, (model, params) in models_class.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n",
|
||
" grid_search.fit(X_train_class, y_train_class)\n",
|
||
" best_model = grid_search.best_estimator_\n",
|
||
" y_pred_class = best_model.predict(X_test_class)\n",
|
||
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
||
" print(f\"Accuracy: {accuracy}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование посещаемости:\n",
|
||
"Задача: Регрессия\n",
|
||
"\n",
|
||
"Выбор метрик:\n",
|
||
"\n",
|
||
"MAE (Mean Absolute Error): Средняя абсолютная ошибка. Показывает среднее отклонение предсказанных значений от фактических. Эта метрика легко интерпретируется, так как она измеряется в тех же единицах, что и целевая переменная.\n",
|
||
"\n",
|
||
"MSE (Mean Squared Error): Среднеквадратичная ошибка. Показывает среднее квадратичное отклонение предсказанных значений от фактических. Эта метрика чувствительна к выбросам, так как ошибки возводятся в квадрат.\n",
|
||
"\n",
|
||
"RMSE (Root Mean Squared Error): Квадратный корень из среднеквадратичной ошибки. Показывает среднее отклонение предсказанных значений от фактических в тех же единицах, что и целевая переменная. Эта метрика также чувствительна к выбросам, но легче интерпретируется, чем MSE.\n",
|
||
"\n",
|
||
"R² (R-squared): Коэффициент детерминации. Показывает, какую долю дисперсии целевой переменной объясняет модель. Значение R² близкое к 1 указывает на хорошее качество модели.\n",
|
||
"\n",
|
||
"Обоснование:\n",
|
||
"\n",
|
||
"MAE: Хорошо подходит для задач, где важно понимать среднее отклонение предсказаний от фактических значений.\n",
|
||
"\n",
|
||
"MSE и RMSE: Полезны для задач, где крупные ошибки особенно нежелательны: из-за возведения ошибок в квадрат эти метрики сильнее штрафуют большие отклонения и поэтому чувствительны к выбросам.\n",
|
||
"\n",
|
||
"R²: Позволяет оценить, насколько хорошо модель объясняет вариацию целевой переменной.\n",
|
||
"\n",
|
||
"2. Оптимизация характеристик:\n",
|
||
"Задача: Классификация\n",
|
||
"\n",
|
||
"Выбор метрик:\n",
|
||
"\n",
|
||
"Accuracy: Доля правильных предсказаний среди всех предсказаний. Эта метрика показывает общую точность модели.\n",
|
||
"\n",
|
||
"Precision: Доля правильных положительных предсказаний среди всех положительных предсказаний. Эта метрика важна, если важно минимизировать количество ложноположительных результатов.\n",
|
||
"\n",
|
||
"Recall (Sensitivity): Доля правильных положительных предсказаний среди всех фактических положительных случаев. Эта метрика важна, если важно минимизировать количество ложноотрицательных результатов.\n",
|
||
"\n",
|
||
"F1-score: Гармоническое среднее между precision и recall. Эта метрика показывает баланс между precision и recall.\n",
|
||
"\n",
|
||
"Обоснование:\n",
|
||
"\n",
|
||
"Accuracy: Хорошо подходит для задач, где классы сбалансированы.\n",
|
||
"\n",
|
||
"Precision и Recall: Важны для задач, где важно минимизировать ошибки определенного типа (ложноположительные или ложноотрицательные).\n",
|
||
"\n",
|
||
"F1-score: Позволяет оценить баланс между precision и recall."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 76,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"Best Parameters: {}\n",
|
||
"MAE: 240.99246411452697\n",
|
||
"MSE: 82771.10925011222\n",
|
||
"RMSE: 287.6996858707222\n",
|
||
"R²: -0.0066830595689202354\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Regression\n",
|
||
"Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 200}\n",
|
||
"MAE: 244.5229418633195\n",
|
||
"MSE: 87788.51054250356\n",
|
||
"RMSE: 296.29125964581465\n",
|
||
"R²: -0.06770595668688673\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Gradient Boosting Regression\n",
|
||
"Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100}\n",
|
||
"MAE: 240.99176421026533\n",
|
||
"MSE: 82412.10586641222\n",
|
||
"RMSE: 287.075087505712\n",
|
||
"R²: -0.002316770075243779\n",
|
||
"\n",
|
||
"Результаты для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Best Parameters: {'model__C': 10, 'model__solver': 'lbfgs'}\n",
|
||
"Accuracy: 0.46111111111111114\n",
|
||
"Precision: 0.475\n",
|
||
"Recall: 0.2\n",
|
||
"F1-score: 0.2814814814814815\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Classification\n",
|
||
"Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 200}\n",
|
||
"Accuracy: 0.4888888888888889\n",
|
||
"Precision: 0.5211267605633803\n",
|
||
"Recall: 0.3894736842105263\n",
|
||
"F1-score: 0.4457831325301205\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100}\n",
|
||
"Accuracy: 0.4722222222222222\n",
|
||
"Precision: 0.5\n",
|
||
"Recall: 0.42105263157894735\n",
|
||
"F1-score: 0.45714285714285713\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"\n",
|
||
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей и их гиперпараметров для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": (LinearRegression(), {}),\n",
|
||
" \"Random Forest Regression\": (RandomForestRegressor(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__max_depth': [None, 10, 20]\n",
|
||
" }),\n",
|
||
" \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__learning_rate': [0.01, 0.1],\n",
|
||
" 'model__max_depth': [3, 5]\n",
|
||
" })\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
||
"X_reg = df[numerical_cols]\n",
|
||
"y_reg = df['Daily_Customer_Count']\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
||
"print(\"Результаты для задачи регрессии:\")\n",
|
||
"for name, (model, params) in models_reg.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n",
|
||
" grid_search.fit(X_train_reg, y_train_reg)\n",
|
||
" best_model = grid_search.best_estimator_\n",
|
||
" y_pred_reg = best_model.predict(X_test_reg)\n",
|
||
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
||
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
||
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
||
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\")\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# Список моделей и их гиперпараметров для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": (LogisticRegression(), {\n",
|
||
" 'model__C': [0.1, 1, 10],\n",
|
||
" 'model__solver': ['liblinear', 'lbfgs']\n",
|
||
" }),\n",
|
||
" \"Random Forest Classification\": (RandomForestClassifier(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__max_depth': [None, 10, 20]\n",
|
||
" }),\n",
|
||
" \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__learning_rate': [0.01, 0.1],\n",
|
||
" 'model__max_depth': [3, 5]\n",
|
||
" })\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
||
"X_class = df[numerical_cols]\n",
|
||
"y_class = (df['Daily_Customer_Count'] > df['Daily_Customer_Count'].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи классификации\n",
|
||
"print(\"Результаты для задачи классификации:\")\n",
|
||
"for name, (model, params) in models_class.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n",
|
||
" grid_search.fit(X_train_class, y_train_class)\n",
|
||
" best_model = grid_search.best_estimator_\n",
|
||
" y_pred_class = best_model.predict(X_test_class)\n",
|
||
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
||
" precision = precision_score(y_test_class, y_pred_class)\n",
|
||
" recall = recall_score(y_test_class, y_pred_class)\n",
|
||
" f1 = f1_score(y_test_class, y_pred_class)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
||
" print(f\"Accuracy: {accuracy}\")\n",
|
||
" print(f\"Precision: {precision}\")\n",
|
||
" print(f\"Recall: {recall}\")\n",
|
||
" print(f\"F1-score: {f1}\")\n",
|
||
" print()\n",
|
||
"\n",
|
||
" # Визуализация матрицы ошибок\n",
|
||
" cm = confusion_matrix(y_test_class, y_pred_class)\n",
|
||
" disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Less', 'More'])\n",
|
||
" disp.plot(cmap=plt.cm.Blues)\n",
|
||
" plt.title(f'Confusion Matrix for {name}')\n",
|
||
" plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Давайте проанализируем полученные значения метрик и определим, являются ли они нормальными или их можно улучшить.\n",
|
||
"\n",
|
||
"### Оценка смещения и дисперсии для задачи регрессии:\n",
|
||
"\n",
|
||
"### Вывод для задачи регрессии:\n",
|
||
"\n",
|
||
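"- Ни одна из моделей не показывает приемлемого качества: по результатам выше R² у всех моделей отрицательный (примерно от -0.002 до -0.068), то есть модели предсказывают не лучше константного прогноза средним значением, а MAE составляет около 241–245 посетителей.\n",
|
||
"- **Gradient Boosting Regression** (MAE ≈ 240.99, R² ≈ -0.002) и **Linear Regression** (MAE ≈ 240.99, R² ≈ -0.007) формально немного лучше, а **Random Forest Regression** показывает худший результат (MAE ≈ 244.52, R² ≈ -0.068).\n",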
|
||
"\n",
|
||
"### Вывод для задачи классификации:\n",
|
||
"\n",
|
||
"- **Random Forest Classification** показывает лучшие Accuracy (≈ 0.489) и Precision (≈ 0.521), а **Gradient Boosting Classification** — лучшие Recall (≈ 0.421) и F1-score (≈ 0.457); **Logistic Regression** уступает обеим моделям (Accuracy ≈ 0.461, F1-score ≈ 0.281).\n",
|
||
"- Accuracy всех моделей ниже 0.5, то есть на уровне случайного угадывания для двух почти сбалансированных классов, поэтому ни одну из моделей нельзя считать точной или стабильной.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Для оценки смещения (bias) и дисперсии (variance) моделей можно использовать метод перекрестной проверки (cross-validation). Этот метод позволяет оценить, насколько хорошо модель обобщается на новых данных.\n",
|
||
"\n",
|
||
"Оценка смещения и дисперсии для задачи регрессии:\n",
|
||
"Для задачи регрессии мы будем использовать метрики MAE (Mean Absolute Error) и R² (R-squared) для оценки смещения и дисперсии.\n",
|
||
"\n",
|
||
"Оценка смещения и дисперсии для задачи классификации:\n",
|
||
"Для задачи классификации мы будем использовать метрики Accuracy, Precision, Recall и F1-score для оценки смещения и дисперсии.\n",
|
||
"\n",
|
||
"Пример кода для оценки смещения и дисперсии:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 77,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Оценка смещения и дисперсии для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"MAE (Cross-Validation): Mean = 214.80552977981765, Std = 10.606512171542404\n",
|
||
"R² (Cross-Validation): Mean = -0.013983192308878256, Std = 0.013712813782736416\n",
|
||
"\n",
|
||
"Model: Random Forest Regression\n",
|
||
"MAE (Cross-Validation): Mean = 228.04118684047177, Std = 8.752812633688961\n",
|
||
"R² (Cross-Validation): Mean = -0.16773777274246124, Std = 0.07089525362334798\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Regression\n",
|
||
"MAE (Cross-Validation): Mean = 223.01691070195233, Std = 7.525579341977898\n",
|
||
"R² (Cross-Validation): Mean = -0.1007213971850566, Std = 0.039722456407795335\n",
|
||
"\n",
|
||
"Оценка смещения и дисперсии для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Accuracy (Cross-Validation): Mean = 0.5055307262569833, Std = 0.03499561917769727\n",
|
||
"Precision (Cross-Validation): Mean = 0.5065468552510806, Std = 0.054654647753909255\n",
|
||
"Recall (Cross-Validation): Mean = 0.36069969356486214, Std = 0.041986149284426406\n",
|
||
"F1-score (Cross-Validation): Mean = 0.41699563277139867, Std = 0.022647838103859376\n",
|
||
"\n",
|
||
"Model: Random Forest Classification\n",
|
||
"Accuracy (Cross-Validation): Mean = 0.47995654872749843, Std = 0.02347679112801281\n",
|
||
"Precision (Cross-Validation): Mean = 0.4767585025913199, Std = 0.027370716762614142\n",
|
||
"Recall (Cross-Validation): Mean = 0.44468845760980596, Std = 0.05499181588361489\n",
|
||
"F1-score (Cross-Validation): Mean = 0.47024453023626417, Std = 0.04502303274822186\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Accuracy (Cross-Validation): Mean = 0.5178895096213532, Std = 0.027332603426564073\n",
|
||
"Precision (Cross-Validation): Mean = 0.5084858601973745, Std = 0.022621295137266188\n",
|
||
"Recall (Cross-Validation): Mean = 0.49668028600612874, Std = 0.04700469023993552\n",
|
||
"F1-score (Cross-Validation): Mean = 0.5055891495803455, Std = 0.03687694960165771\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import cross_val_score\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
|
||
"\n",
|
||
"# Определяем категориальные и числовые столбцы\n",
|
||
"\n",
|
||
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
||
"X_reg = df[numerical_cols]\n",
|
||
"y_reg = df['Daily_Customer_Count']\n",
|
||
"\n",
|
||
"# Список моделей для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": LinearRegression(),\n",
|
||
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
||
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Оценка смещения и дисперсии для задачи регрессии\n",
|
||
"print(\"Оценка смещения и дисперсии для задачи регрессии:\")\n",
|
||
"for name, model in models_reg.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n",
|
||
" r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"MAE (Cross-Validation): Mean = {mae_scores.mean()}, Std = {mae_scores.std()}\")\n",
|
||
" print(f\"R² (Cross-Validation): Mean = {r2_scores.mean()}, Std = {r2_scores.std()}\")\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
||
"X_class = df[numerical_cols]\n",
|
||
"y_class = (df['Daily_Customer_Count'] > df['Daily_Customer_Count'].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Список моделей для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": LogisticRegression(),\n",
|
||
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
||
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Оценка смещения и дисперсии для задачи классификации\n",
|
||
"print(\"Оценка смещения и дисперсии для задачи классификации:\")\n",
|
||
"for name, model in models_class.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n",
|
||
" precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n",
|
||
" recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n",
|
||
" f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Accuracy (Cross-Validation): Mean = {accuracy_scores.mean()}, Std = {accuracy_scores.std()}\")\n",
|
||
" print(f\"Precision (Cross-Validation): Mean = {precision_scores.mean()}, Std = {precision_scores.std()}\")\n",
|
||
" print(f\"Recall (Cross-Validation): Mean = {recall_scores.mean()}, Std = {recall_scores.std()}\")\n",
|
||
" print(f\"F1-score (Cross-Validation): Mean = {f1_scores.mean()}, Std = {f1_scores.std()}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1200x600 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import cross_val_score, cross_validate\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# Fixed seed so the tree ensembles produce the same scores on every run\n",
"RANDOM_STATE = 42\n",
"# Number of cross-validation folds used for every model\n",
"CV_FOLDS = 5\n",
"\n",
"# Load the dataset\n",
"df = pd.read_csv(\".//static//csv//Stores.csv\")\n",
"\n",
"# Feature columns; only numeric columns are used, so no categorical encoder is applied\n",
"numerical_cols = [\"Store_Area\", \"Items_Available\", \"Store_Sales\"]\n",
"\n",
"# Preprocessing step: standardize the numeric feature columns\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', StandardScaler(), numerical_cols)\n",
"    ])\n",
"\n",
"\n",
"def evaluate_models(models, X, y, scoring, cv=CV_FOLDS):\n",
"    \"\"\"Cross-validate every model and summarize each requested metric.\n",
"\n",
"    Args:\n",
"        models: dict mapping a display name to an unfitted estimator.\n",
"        X: feature DataFrame containing the columns in numerical_cols.\n",
"        y: target values aligned with X.\n",
"        scoring: dict mapping a metric key to a scikit-learn scorer name.\n",
"        cv: number of cross-validation folds.\n",
"\n",
"    Returns:\n",
"        dict mapping each metric key to a (means, stds) pair of lists,\n",
"        ordered like the entries of models.\n",
"    \"\"\"\n",
"    summary = {metric: ([], []) for metric in scoring}\n",
"    for model in models.values():\n",
"        pipeline = Pipeline(steps=[\n",
"            ('preprocessor', preprocessor),\n",
"            ('model', model)\n",
"        ])\n",
"        # A single cross-validation pass scores all metrics on the same fitted models\n",
"        cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=scoring)\n",
"        for metric, scorer in scoring.items():\n",
"            scores = cv_results[f'test_{metric}']\n",
"            # 'neg_*' scorers return negated errors; flip the sign back\n",
"            if scorer.startswith('neg_'):\n",
"                scores = -scores\n",
"            means, stds = summary[metric]\n",
"            means.append(scores.mean())\n",
"            stds.append(scores.std())\n",
"    return summary\n",
"\n",
"\n",
"def plot_metric(axis, models, means, stds, ylabel, title):\n",
"    \"\"\"Draw one bar per model (mean score) with the standard deviation as error bar.\"\"\"\n",
"    axis.bar(list(models), means, yerr=stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
"    axis.set_ylabel(ylabel)\n",
"    axis.set_title(title)\n",
"    axis.yaxis.grid(True)\n",
"\n",
"\n",
"# Features (X) and target (y) for the regression task\n",
"X_reg = df[numerical_cols]\n",
"y_reg = df['Daily_Customer_Count']\n",
"\n",
"# Models for the regression task\n",
"models_reg = {\n",
"    \"Linear Regression\": LinearRegression(),\n",
"    \"Random Forest Regression\": RandomForestRegressor(random_state=RANDOM_STATE),\n",
"    \"Gradient Boosting Regression\": GradientBoostingRegressor(random_state=RANDOM_STATE)\n",
"}\n",
"\n",
"# Mean and spread (standard deviation) of the cross-validated MAE and R² per model\n",
"reg_summary = evaluate_models(\n",
"    models_reg, X_reg, y_reg,\n",
"    {'mae': 'neg_mean_absolute_error', 'r2': 'r2'})\n",
"mae_means, mae_stds = reg_summary['mae']\n",
"r2_means, r2_stds = reg_summary['r2']\n",
"\n",
"# Plot the regression results\n",
"fig, ax = plt.subplots(1, 2, figsize=(12, 6))\n",
"plot_metric(ax[0], models_reg, mae_means, mae_stds,\n",
"            'MAE', 'Mean Absolute Error (MAE) for Regression Models')\n",
"plot_metric(ax[1], models_reg, r2_means, r2_stds,\n",
"            'R²', 'R-squared (R²) for Regression Models')\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Features (X) and target (y) for the classification task:\n",
"# class 1 means the daily customer count is above the dataset mean\n",
"X_class = df[numerical_cols]\n",
"y_class = (df['Daily_Customer_Count'] > df['Daily_Customer_Count'].mean()).astype(int)\n",
"\n",
"# Models for the classification task\n",
"models_class = {\n",
"    \"Logistic Regression\": LogisticRegression(),\n",
"    \"Random Forest Classification\": RandomForestClassifier(random_state=RANDOM_STATE),\n",
"    \"Gradient Boosting Classification\": GradientBoostingClassifier(random_state=RANDOM_STATE)\n",
"}\n",
"\n",
"# Scorer name -> axis label, in the order the panels are drawn\n",
"class_metrics = {\n",
"    'accuracy': 'Accuracy',\n",
"    'precision': 'Precision',\n",
"    'recall': 'Recall',\n",
"    'f1': 'F1-score'\n",
"}\n",
"\n",
"# Mean and spread (standard deviation) of each cross-validated metric per model\n",
"class_summary = evaluate_models(\n",
"    models_class, X_class, y_class,\n",
"    {metric: metric for metric in class_metrics})\n",
"accuracy_means, accuracy_stds = class_summary['accuracy']\n",
"precision_means, precision_stds = class_summary['precision']\n",
"recall_means, recall_stds = class_summary['recall']\n",
"f1_means, f1_stds = class_summary['f1']\n",
"\n",
"# Plot the classification results, one panel per metric\n",
"fig, ax = plt.subplots(2, 2, figsize=(12, 12))\n",
"for axis, (metric, label) in zip(ax.flat, class_metrics.items()):\n",
"    means, stds = class_summary[metric]\n",
"    plot_metric(axis, models_class, means, stds,\n",
"                label, f'{label} for Classification Models')\n",
"plt.tight_layout()\n",
"plt.show()"
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "aisenv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|