1811 lines
424 KiB
Plaintext
1811 lines
424 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Начало лабораторной работы"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 85,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"print(df.columns)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Бизнес-цели"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование стоимости страховых взносов:\n",
|
||
"\n",
|
||
"Цель: Разработать модель, которая будет предсказывать стоимость страховых взносов для новых клиентов на основе их характеристик (возраст, пол, ИМТ, количество детей, статус курения, регион проживания).\n",
|
||
"\n",
|
||
"Применение:\n",
|
||
"Клиенты могут получить представление о примерной стоимости страховки до обращения в компанию.\n",
|
||
"\n",
|
||
"2. Оптимизация тарифной сетки:\n",
|
||
"\n",
|
||
"Цель: Определить оптимальные коэффициенты для различных факторов, влияющих на стоимость страховки (например, возраст, ИМТ, статус курения), чтобы максимизировать прибыль компании при сохранении конкурентоспособных тарифов.\n",
|
||
"\n",
|
||
"Применение:\n",
|
||
"Страховые компании могут использовать эти коэффициенты для корректировки тарифной сетки и повышения эффективности бизнеса."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование стоимости страховых взносов:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 86,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Среднее значение поля 'charges': 13261.369959046897\n",
|
||
" age sex bmi children smoker region charges \\\n",
|
||
"0 19 female 27.900 0 yes southwest 16884.92400 \n",
|
||
"1 18 male 33.770 1 no southeast 1725.55230 \n",
|
||
"2 28 male 33.000 3 no southeast 4449.46200 \n",
|
||
"3 33 male 22.705 0 no northwest 21984.47061 \n",
|
||
"4 32 male 28.880 0 no northwest 3866.85520 \n",
|
||
"\n",
|
||
" above_average_charges charges_volatility \n",
|
||
"0 1 62648.55411 \n",
|
||
"1 0 62648.55411 \n",
|
||
"2 0 62648.55411 \n",
|
||
"3 1 62648.55411 \n",
|
||
"4 0 62648.55411 \n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Устанавливаем случайное состояние\n",
|
||
"random_state = 42\n",
|
||
"\n",
|
||
"# Рассчитываем среднее значение стоимости страховых взносов\n",
|
||
"average_charges = df['charges'].mean()\n",
|
||
"print(f\"Среднее значение поля 'charges': {average_charges}\")\n",
|
||
"\n",
|
||
"# Создаем новую переменную, указывающую, превышает ли стоимость страховых взносов среднюю\n",
|
||
"df['above_average_charges'] = (df['charges'] > average_charges).astype(int)\n",
|
||
"\n",
|
||
"# Рассчитываем волатильность (разницу между максимальной и минимальной стоимостью страховых взносов)\n",
|
||
"df['charges_volatility'] = df['charges'].max() - df['charges'].min()\n",
|
||
"\n",
|
||
"# Выводим первые строки измененной таблицы для проверки\n",
|
||
"print(df.head())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"2. Оптимизация тарифной сетки:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 87,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Средняя стоимость страховых взносов для 'age':\n",
|
||
"age\n",
|
||
"18 6714.267794\n",
|
||
"19 9634.641344\n",
|
||
"20 10159.697736\n",
|
||
"21 5349.737625\n",
|
||
"22 10675.132648\n",
|
||
"23 12050.721224\n",
|
||
"24 10648.015962\n",
|
||
"25 9610.781531\n",
|
||
"26 5955.403311\n",
|
||
"27 13130.462272\n",
|
||
"28 8757.474523\n",
|
||
"29 10430.158727\n",
|
||
"30 13580.480238\n",
|
||
"31 10196.980573\n",
|
||
"32 10071.740266\n",
|
||
"33 12118.482617\n",
|
||
"34 11613.528121\n",
|
||
"35 11307.182031\n",
|
||
"36 12204.476138\n",
|
||
"37 17595.511688\n",
|
||
"38 8102.733674\n",
|
||
"39 11468.895088\n",
|
||
"40 11772.251310\n",
|
||
"41 9533.603123\n",
|
||
"42 13061.038669\n",
|
||
"43 19267.278653\n",
|
||
"44 16439.727524\n",
|
||
"45 14404.055995\n",
|
||
"46 14201.069951\n",
|
||
"47 18153.128652\n",
|
||
"48 14632.500445\n",
|
||
"49 12696.006264\n",
|
||
"50 15663.003301\n",
|
||
"51 15452.800438\n",
|
||
"52 18951.581034\n",
|
||
"53 15795.645012\n",
|
||
"54 18252.834139\n",
|
||
"55 16164.545488\n",
|
||
"56 14727.018377\n",
|
||
"57 16283.671944\n",
|
||
"58 13815.290525\n",
|
||
"59 18639.637165\n",
|
||
"60 21979.418507\n",
|
||
"61 22024.457609\n",
|
||
"62 18926.646066\n",
|
||
"63 19884.998461\n",
|
||
"64 24419.101775\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'sex':\n",
|
||
"sex\n",
|
||
"female 12486.831977\n",
|
||
"male 14013.872721\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'bmi':\n",
|
||
"bmi\n",
|
||
"15.960 1694.796400\n",
|
||
"16.815 4904.000350\n",
|
||
"17.195 14455.644050\n",
|
||
"17.290 7813.353433\n",
|
||
"17.385 2775.192150\n",
|
||
" ... \n",
|
||
"48.070 9432.925300\n",
|
||
"49.060 11381.325400\n",
|
||
"50.380 2438.055200\n",
|
||
"52.580 44501.398200\n",
|
||
"53.130 1163.462700\n",
|
||
"Name: charges, Length: 548, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'children':\n",
|
||
"children\n",
|
||
"0 12317.920881\n",
|
||
"1 12722.650521\n",
|
||
"2 15268.182723\n",
|
||
"3 15304.070620\n",
|
||
"4 13550.983876\n",
|
||
"5 8706.036629\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'smoker':\n",
|
||
"smoker\n",
|
||
"no 8417.874411\n",
|
||
"yes 32223.139764\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'region':\n",
|
||
"region\n",
|
||
"northeast 13475.874737\n",
|
||
"northwest 12463.129315\n",
|
||
"southeast 14748.777706\n",
|
||
"southwest 12164.196435\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для комбинации 'age' и 'smoker':\n",
|
||
"age smoker\n",
|
||
"18 no 3083.404099\n",
|
||
" yes 25473.730221\n",
|
||
"19 no 3492.047133\n",
|
||
" yes 26445.951817\n",
|
||
"20 no 3673.112925\n",
|
||
" ... \n",
|
||
"62 yes 37084.607312\n",
|
||
"63 no 14205.335706\n",
|
||
" yes 40331.784380\n",
|
||
"64 no 15805.350545\n",
|
||
" yes 40569.885331\n",
|
||
"Name: charges, Length: 94, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для комбинации 'bmi' и 'smoker':\n",
|
||
"bmi smoker\n",
|
||
"15.960 no 1694.79640\n",
|
||
"16.815 no 4904.00035\n",
|
||
"17.195 yes 14455.64405\n",
|
||
"17.290 no 5305.30260\n",
|
||
" yes 12829.45510\n",
|
||
" ... \n",
|
||
"48.070 no 9432.92530\n",
|
||
"49.060 no 11381.32540\n",
|
||
"50.380 no 2438.05520\n",
|
||
"52.580 yes 44501.39820\n",
|
||
"53.130 no 1163.46270\n",
|
||
"Name: charges, Length: 701, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для комбинации 'region' и 'smoker':\n",
|
||
"region smoker\n",
|
||
"northeast no 9225.395851\n",
|
||
" yes 29790.212814\n",
|
||
"northwest no 8681.948181\n",
|
||
" yes 29959.103039\n",
|
||
"southeast no 7887.181702\n",
|
||
" yes 35262.090761\n",
|
||
"southwest no 7956.579615\n",
|
||
" yes 32346.494062\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю стоимость страховых взносов для каждого значения каждого признака\n",
|
||
"for column in ['age', 'sex', 'bmi', 'children', 'smoker', 'region']:\n",
|
||
" print(f\"Средняя стоимость страховых взносов для '{column}':\")\n",
|
||
" print(df.groupby(column)['charges'].mean())\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю стоимость страховых взносов для комбинаций признаков\n",
|
||
"# для комбинации 'age' и 'smoker'\n",
|
||
"print(\"Средняя стоимость страховых взносов для комбинации 'age' и 'smoker':\")\n",
|
||
"print(df.groupby(['age', 'smoker'])['charges'].mean())\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю стоимость страховых взносов для комбинации 'bmi' и 'smoker'\n",
|
||
"print(\"Средняя стоимость страховых взносов для комбинации 'bmi' и 'smoker':\")\n",
|
||
"print(df.groupby(['bmi', 'smoker'])['charges'].mean())\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю стоимость страховых взносов для комбинации 'region' и 'smoker'\n",
|
||
"print(\"Средняя стоимость страховых взносов для комбинации 'region' и 'smoker':\")\n",
|
||
"print(df.groupby(['region', 'smoker'])['charges'].mean())\n",
|
||
"print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 88,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Пропуски не обнаружены.\n",
|
||
"MAE: 4160.247974762991\n",
|
||
"MSE: 39933194.54805147\n",
|
||
"RMSE: 6319.271678607549\n",
|
||
"R²: 0.73981661775643\n",
|
||
"Ориентиры для прогнозирования стоимости страховых взносов не достигнуты.\n",
|
||
"Средняя стоимость страховых взносов для 'age':\n",
|
||
"age\n",
|
||
"18 6714.267794\n",
|
||
"19 9634.641344\n",
|
||
"20 10159.697736\n",
|
||
"21 5349.737625\n",
|
||
"22 10675.132648\n",
|
||
"23 12050.721224\n",
|
||
"24 10648.015962\n",
|
||
"25 9610.781531\n",
|
||
"26 5955.403311\n",
|
||
"27 13130.462272\n",
|
||
"28 8757.474523\n",
|
||
"29 10430.158727\n",
|
||
"30 13580.480238\n",
|
||
"31 10196.980573\n",
|
||
"32 10071.740266\n",
|
||
"33 12118.482617\n",
|
||
"34 11613.528121\n",
|
||
"35 11307.182031\n",
|
||
"36 12204.476138\n",
|
||
"37 17595.511688\n",
|
||
"38 8102.733674\n",
|
||
"39 11468.895088\n",
|
||
"40 11772.251310\n",
|
||
"41 9533.603123\n",
|
||
"42 13061.038669\n",
|
||
"43 19267.278653\n",
|
||
"44 16439.727524\n",
|
||
"45 14404.055995\n",
|
||
"46 14201.069951\n",
|
||
"47 18153.128652\n",
|
||
"48 14632.500445\n",
|
||
"49 12696.006264\n",
|
||
"50 15663.003301\n",
|
||
"51 15452.800438\n",
|
||
"52 18951.581034\n",
|
||
"53 15795.645012\n",
|
||
"54 18252.834139\n",
|
||
"55 16164.545488\n",
|
||
"56 14727.018377\n",
|
||
"57 16283.671944\n",
|
||
"58 13815.290525\n",
|
||
"59 18639.637165\n",
|
||
"60 21979.418507\n",
|
||
"61 22024.457609\n",
|
||
"62 18926.646066\n",
|
||
"63 19884.998461\n",
|
||
"64 24419.101775\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'bmi':\n",
|
||
"bmi\n",
|
||
"15.960 1694.796400\n",
|
||
"16.815 4904.000350\n",
|
||
"17.195 14455.644050\n",
|
||
"17.290 7813.353433\n",
|
||
"17.385 2775.192150\n",
|
||
" ... \n",
|
||
"48.070 9432.925300\n",
|
||
"49.060 11381.325400\n",
|
||
"50.380 2438.055200\n",
|
||
"52.580 44501.398200\n",
|
||
"53.130 1163.462700\n",
|
||
"Name: charges, Length: 548, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'children':\n",
|
||
"children\n",
|
||
"0 12317.920881\n",
|
||
"1 12722.650521\n",
|
||
"2 15268.182723\n",
|
||
"3 15304.070620\n",
|
||
"4 13550.983876\n",
|
||
"5 8706.036629\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'sex_male':\n",
|
||
"sex_male\n",
|
||
"False 12486.831977\n",
|
||
"True 14013.872721\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'smoker_yes':\n",
|
||
"smoker_yes\n",
|
||
"False 8417.874411\n",
|
||
"True 32223.139764\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'region_northwest':\n",
|
||
"region_northwest\n",
|
||
"False 13512.808188\n",
|
||
"True 12463.129315\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'region_southeast':\n",
|
||
"region_southeast\n",
|
||
"False 12693.396712\n",
|
||
"True 14748.777706\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для 'region_southwest':\n",
|
||
"region_southwest\n",
|
||
"False 13620.788872\n",
|
||
"True 12164.196435\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для комбинации 'age' и 'smoker_yes':\n",
|
||
"age smoker_yes\n",
|
||
"18 False 3083.404099\n",
|
||
" True 25473.730221\n",
|
||
"19 False 3492.047133\n",
|
||
" True 26445.951817\n",
|
||
"20 False 3673.112925\n",
|
||
" ... \n",
|
||
"62 True 37084.607312\n",
|
||
"63 False 14205.335706\n",
|
||
" True 40331.784380\n",
|
||
"64 False 15805.350545\n",
|
||
" True 40569.885331\n",
|
||
"Name: charges, Length: 94, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для комбинации 'bmi' и 'smoker_yes':\n",
|
||
"bmi smoker_yes\n",
|
||
"15.960 False 1694.79640\n",
|
||
"16.815 False 4904.00035\n",
|
||
"17.195 True 14455.64405\n",
|
||
"17.290 False 5305.30260\n",
|
||
" True 12829.45510\n",
|
||
" ... \n",
|
||
"48.070 False 9432.92530\n",
|
||
"49.060 False 11381.32540\n",
|
||
"50.380 False 2438.05520\n",
|
||
"52.580 True 44501.39820\n",
|
||
"53.130 False 1163.46270\n",
|
||
"Name: charges, Length: 701, dtype: float64\n",
|
||
"\n",
|
||
"Средняя стоимость страховых взносов для комбинации 'region_northwest' и 'smoker_yes':\n",
|
||
"region_northwest smoker_yes\n",
|
||
"False False 8331.120934\n",
|
||
" True 32822.144996\n",
|
||
"True False 8681.948181\n",
|
||
" True 29959.103039\n",
|
||
"Name: charges, dtype: float64\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"from imblearn.over_sampling import SMOTE\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Проверка наличия пропусков\n",
|
||
"if df.isnull().sum().any():\n",
|
||
" print(\"Пропуски обнаружены в следующих столбцах:\")\n",
|
||
" print(df.isnull().sum())\n",
|
||
"else:\n",
|
||
" print(\"Пропуски не обнаружены.\")\n",
|
||
"\n",
|
||
"# Преобразуем категориальные переменные в числовые\n",
|
||
"df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y)\n",
|
||
"X = df.drop('charges', axis=1)\n",
|
||
"y = df['charges']\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки\n",
|
||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Стандартизируем признаки\n",
|
||
"scaler = StandardScaler()\n",
|
||
"X_train = scaler.fit_transform(X_train)\n",
|
||
"X_test = scaler.transform(X_test)\n",
|
||
"\n",
|
||
"# Обучаем модель линейной регрессии\n",
|
||
"model = LinearRegression()\n",
|
||
"model.fit(X_train, y_train)\n",
|
||
"\n",
|
||
"# Делаем предсказания на тестовой выборке\n",
|
||
"y_pred = model.predict(X_test)\n",
|
||
"\n",
|
||
"# Оцениваем качество модели\n",
|
||
"mae = mean_absolute_error(y_test, y_pred)\n",
|
||
"mse = mean_squared_error(y_test, y_pred)\n",
|
||
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
|
||
"r2 = r2_score(y_test, y_pred)\n",
|
||
"\n",
|
||
"print(f\"MAE: {mae}\")\n",
|
||
"print(f\"MSE: {mse}\")\n",
|
||
"print(f\"RMSE: {rmse}\")\n",
|
||
"print(f\"R²: {r2}\")\n",
|
||
"\n",
|
||
"# Проверяем, достигнуты ли ориентиры\n",
|
||
"if r2 >= 0.75 and mae <= 3000 and rmse <= 5000:\n",
|
||
" print(\"Ориентиры для прогнозирования стоимости страховых взносов достигнуты!\")\n",
|
||
"else:\n",
|
||
" print(\"Ориентиры для прогнозирования стоимости страховых взносов не достигнуты.\")\n",
|
||
"\n",
|
||
"# Оптимизация тарифной сетки\n",
|
||
"# Убедитесь, что столбцы существуют\n",
|
||
"columns_to_group = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes', 'region_northwest', 'region_southeast', 'region_southwest']\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю стоимость страховых взносов для каждого значения каждого признака\n",
|
||
"for column in columns_to_group:\n",
|
||
" print(f\"Средняя стоимость страховых взносов для '{column}':\")\n",
|
||
" print(df.groupby(column)['charges'].mean())\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю стоимость страховых взносов для комбинаций признаков\n",
|
||
"# Например, для комбинации 'age' и 'smoker_yes'\n",
|
||
"print(\"Средняя стоимость страховых взносов для комбинации 'age' и 'smoker_yes':\")\n",
|
||
"print(df.groupby(['age', 'smoker_yes'])['charges'].mean())\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю стоимость страховых взносов для комбинации 'bmi' и 'smoker_yes'\n",
|
||
"print(\"Средняя стоимость страховых взносов для комбинации 'bmi' и 'smoker_yes':\")\n",
|
||
"print(df.groupby(['bmi', 'smoker_yes'])['charges'].mean())\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Рассчитываем среднюю стоимость страховых взносов для комбинации 'region_northwest' и 'smoker_yes'\n",
|
||
"print(\"Средняя стоимость страховых взносов для комбинации 'region_northwest' и 'smoker_yes':\")\n",
|
||
"print(df.groupby(['region_northwest', 'smoker_yes'])['charges'].mean())\n",
|
||
"print()\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 89,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"MAE: 4160.247974762991\n",
|
||
"MSE: 39933194.54805147\n",
|
||
"RMSE: 6319.271678607549\n",
|
||
"R²: 0.73981661775643\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Regression\n",
|
||
"MAE: 1305.6051789457651\n",
|
||
"MSE: 7520094.230349512\n",
|
||
"RMSE: 2742.279021243008\n",
|
||
"R²: 0.9510030796737707\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Regression\n",
|
||
"MAE: 2297.7789526178267\n",
|
||
"MSE: 19231434.895688985\n",
|
||
"RMSE: 4385.365993356653\n",
|
||
"R²: 0.8746982345593102\n",
|
||
"\n",
|
||
"Результаты для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Accuracy: 0.8864864864864865\n",
|
||
"MAE: 0.1723401794364032\n",
|
||
"MSE: 0.08904020819558058\n",
|
||
"RMSE: 0.29839605928292784\n",
|
||
"R²: 0.5752685271247916\n",
|
||
"\n",
|
||
"Model: Random Forest Classification\n",
|
||
"Accuracy: 0.9765765765765766\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Accuracy: 0.9225225225225225\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Преобразуем категориальные переменные в числовые\n",
|
||
"df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
||
"X_reg = df.drop('charges', axis=1)\n",
|
||
"y_reg = df['charges']\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Стандартизируем признаки для задачи регрессии\n",
|
||
"scaler_reg = StandardScaler()\n",
|
||
"X_train_reg = scaler_reg.fit_transform(X_train_reg)\n",
|
||
"X_test_reg = scaler_reg.transform(X_test_reg)\n",
|
||
"\n",
|
||
"# Список моделей для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": LinearRegression(),\n",
|
||
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
||
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
||
"print(\"Результаты для задачи регрессии:\")\n",
|
||
"for name, model in models_reg.items():\n",
|
||
" model.fit(X_train_reg, y_train_reg)\n",
|
||
" y_pred_reg = model.predict(X_test_reg)\n",
|
||
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
||
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
||
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
||
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\")\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
||
"X_class = df.drop('charges', axis=1)\n",
|
||
"y_class = (df['charges'] > df['charges'].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Стандартизируем признаки для задачи классификации\n",
|
||
"scaler_class = StandardScaler()\n",
|
||
"X_train_class = scaler_class.fit_transform(X_train_class)\n",
|
||
"X_test_class = scaler_class.transform(X_test_class)\n",
|
||
"\n",
|
||
"# Список моделей для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": LogisticRegression(max_iter=1000),\n",
|
||
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
||
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи классификации\n",
|
||
"print(\"Результаты для задачи классификации:\")\n",
|
||
"for name, model in models_class.items():\n",
|
||
" model.fit(X_train_class, y_train_class)\n",
|
||
" y_pred_class = model.predict(X_test_class)\n",
|
||
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Accuracy: {accuracy}\")\n",
|
||
"\n",
|
||
" # Вычисление вероятностей для логистической регрессии\n",
|
||
" if name == \"Logistic Regression\":\n",
|
||
" y_pred_probs = model.predict_proba(X_test_class)[:, 1]\n",
|
||
" mae = mean_absolute_error(y_test_class, y_pred_probs)\n",
|
||
" mse = mean_squared_error(y_test_class, y_pred_probs)\n",
|
||
" rmse = mean_squared_error(y_test_class, y_pred_probs, squared=False)\n",
|
||
" r2 = r2_score(y_test_class, y_pred_probs)\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование стоимости страховых взносов:\n",
|
||
"\n",
|
||
"Конвейер для задачи регрессии:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 90,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"MAE: 4165.371459891892\n",
|
||
"MSE: 39957124.62099743\n",
|
||
"RMSE: 6321.164815205931\n",
|
||
"R²: 0.7396607021732446\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Regression\n",
|
||
"MAE: 1301.4460596254232\n",
|
||
"MSE: 7345804.075966105\n",
|
||
"RMSE: 2710.3143869237947\n",
|
||
"R²: 0.9521386612963394\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Regression\n",
|
||
"MAE: 2304.718628546955\n",
|
||
"MSE: 19256343.733882822\n",
|
||
"RMSE: 4388.205069716184\n",
|
||
"R²: 0.8745359418641633\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Определяем категориальные и числовые столбцы\n",
|
||
"categorical_cols = ['sex', 'smoker', 'region']\n",
|
||
"numerical_cols = ['age', 'bmi', 'children']\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('cat', OneHotEncoder(), categorical_cols),\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": LinearRegression(),\n",
|
||
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
||
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
||
"X_reg = df[categorical_cols + numerical_cols]\n",
|
||
"y_reg = df['charges']\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
||
"print(\"Результаты для задачи регрессии:\")\n",
|
||
"for name, model in models_reg.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" pipeline.fit(X_train_reg, y_train_reg)\n",
|
||
" y_pred_reg = pipeline.predict(X_test_reg)\n",
|
||
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
||
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
||
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
||
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"2. Оптимизация тарифной сетки:\n",
|
||
"\n",
|
||
"Конвейер для задачи классификации:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 91,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Accuracy: 0.8846846846846846\n",
|
||
"\n",
|
||
"Model: Random Forest Classification\n",
|
||
"Accuracy: 0.9765765765765766\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Accuracy: 0.9243243243243243\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import accuracy_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Определяем категориальные и числовые столбцы\n",
|
||
"categorical_cols = ['sex', 'smoker', 'region']\n",
|
||
"numerical_cols = ['age', 'bmi', 'children']\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('cat', OneHotEncoder(), categorical_cols),\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": LogisticRegression(),\n",
|
||
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
||
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
||
"X_class = df[categorical_cols + numerical_cols]\n",
|
||
"y_class = (df['charges'] > df['charges'].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи классификации\n",
|
||
"print(\"Результаты для задачи классификации:\")\n",
|
||
"for name, model in models_class.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" pipeline.fit(X_train_class, y_train_class)\n",
|
||
" y_pred_class = pipeline.predict(X_test_class)\n",
|
||
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Accuracy: {accuracy}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"1. Прогнозирование стоимости страховых взносов:\n",
|
||
"\n",
|
||
"Настройка гиперпараметров для задачи регрессии:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 92,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"Best Parameters: {}\n",
|
||
"MAE: 4165.371459891892\n",
|
||
"MSE: 39957124.62099743\n",
|
||
"RMSE: 6321.164815205931\n",
|
||
"R²: 0.7396607021732446\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Regression\n",
|
||
"Best Parameters: {'model__max_depth': None, 'model__n_estimators': 200}\n",
|
||
"MAE: 1281.6915217498565\n",
|
||
"MSE: 7387848.666520319\n",
|
||
"RMSE: 2718.059724605094\n",
|
||
"R²: 0.9518647211846292\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Regression\n",
|
||
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n",
|
||
"MAE: 1556.213395665512\n",
|
||
"MSE: 9345361.053511541\n",
|
||
"RMSE: 3057.018327310378\n",
|
||
"R²: 0.9391106152485712\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Определяем категориальные и числовые столбцы\n",
|
||
"categorical_cols = ['sex', 'smoker', 'region']\n",
|
||
"numerical_cols = ['age', 'bmi', 'children']\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('cat', OneHotEncoder(), categorical_cols),\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей и их гиперпараметров для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": (LinearRegression(), {}),\n",
|
||
" \"Random Forest Regression\": (RandomForestRegressor(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__max_depth': [None, 10, 20]\n",
|
||
" }),\n",
|
||
" \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__learning_rate': [0.01, 0.1],\n",
|
||
" 'model__max_depth': [3, 5]\n",
|
||
" })\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
||
"X_reg = df[categorical_cols + numerical_cols]\n",
|
||
"y_reg = df['charges']\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
||
"print(\"Результаты для задачи регрессии:\")\n",
|
||
"for name, (model, params) in models_reg.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n",
|
||
" grid_search.fit(X_train_reg, y_train_reg)\n",
|
||
" best_model = grid_search.best_estimator_\n",
|
||
" y_pred_reg = best_model.predict(X_test_reg)\n",
|
||
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
||
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
||
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
||
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"2. Оптимизация тарифной сетки:\n",
|
||
"\n",
|
||
"Настройка гиперпараметров для задачи классификации:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 93,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Результаты для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Best Parameters: {'model__C': 10, 'model__solver': 'liblinear'}\n",
|
||
"Accuracy: 0.8864864864864865\n",
|
||
"\n",
|
||
"Model: Random Forest Classification\n",
|
||
"Best Parameters: {'model__max_depth': None, 'model__n_estimators': 100}\n",
|
||
"Accuracy: 0.9765765765765766\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n",
|
||
"Accuracy: 0.9621621621621622\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import accuracy_score\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Определяем категориальные и числовые столбцы\n",
|
||
"categorical_cols = ['sex', 'smoker', 'region']\n",
|
||
"numerical_cols = ['age', 'bmi', 'children']\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('cat', OneHotEncoder(), categorical_cols),\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей и их гиперпараметров для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": (LogisticRegression(), {\n",
|
||
" 'model__C': [0.1, 1, 10],\n",
|
||
" 'model__solver': ['liblinear', 'lbfgs']\n",
|
||
" }),\n",
|
||
" \"Random Forest Classification\": (RandomForestClassifier(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__max_depth': [None, 10, 20]\n",
|
||
" }),\n",
|
||
" \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__learning_rate': [0.01, 0.1],\n",
|
||
" 'model__max_depth': [3, 5]\n",
|
||
" })\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
||
"X_class = df[categorical_cols + numerical_cols]\n",
|
||
"y_class = (df['charges'] > df['charges'].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи классификации\n",
|
||
"print(\"Результаты для задачи классификации:\")\n",
|
||
"for name, (model, params) in models_class.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n",
|
||
" grid_search.fit(X_train_class, y_train_class)\n",
|
||
" best_model = grid_search.best_estimator_\n",
|
||
" y_pred_class = best_model.predict(X_test_class)\n",
|
||
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
||
" print(f\"Accuracy: {accuracy}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 94,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Пропуски не обнаружены.\n",
|
||
"Результаты для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"Best Parameters: {}\n",
|
||
"MAE: 4165.371459891892\n",
|
||
"MSE: 39957124.62099743\n",
|
||
"RMSE: 6321.164815205931\n",
|
||
"R²: 0.7396607021732446\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n",
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Regression\n",
|
||
"Best Parameters: {'model__max_depth': 20, 'model__n_estimators': 200}\n",
|
||
"MAE: 1268.148178236854\n",
|
||
"MSE: 7192020.470646844\n",
|
||
"RMSE: 2681.7942632959084\n",
|
||
"R²: 0.9531406331901089\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Gradient Boosting Regression\n",
|
||
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n",
|
||
"MAE: 1554.914386244714\n",
|
||
"MSE: 9310054.341873825\n",
|
||
"RMSE: 3051.2381653803795\n",
|
||
"R²: 0.9393406549374507\n",
|
||
"\n",
|
||
"Результаты для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Best Parameters: {'model__C': 10, 'model__solver': 'liblinear'}\n",
|
||
"Accuracy: 0.8864864864864865\n",
|
||
"Precision: 1.0\n",
|
||
"Recall: 0.6204819277108434\n",
|
||
"F1-score: 0.7657992565055762\n",
|
||
"AUC: 0.8852866478768545\n",
|
||
"MAE: 0.17141106227002562\n",
|
||
"MSE: 0.0889310663866395\n",
|
||
"RMSE: 0.29821312242528747\n",
|
||
"R²: 0.5757891454185178\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"c:\\Users\\Админ\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Random Forest Classification\n",
|
||
"Best Parameters: {'model__max_depth': 20, 'model__n_estimators': 100}\n",
|
||
"Accuracy: 0.9765765765765766\n",
|
||
"Precision: 0.9872611464968153\n",
|
||
"Recall: 0.9337349397590361\n",
|
||
"F1-score: 0.9597523219814241\n",
|
||
"AUC: 0.9846300368569394\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}\n",
|
||
"Accuracy: 0.9621621621621622\n",
|
||
"Precision: 0.9738562091503268\n",
|
||
"Recall: 0.8975903614457831\n",
|
||
"F1-score: 0.9341692789968652\n",
|
||
"AUC: 0.9856211478303961\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 640x480 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score, \n",
|
||
" accuracy_score, precision_score, recall_score, \n",
|
||
" f1_score, confusion_matrix, ConfusionMatrixDisplay, \n",
|
||
" roc_curve, auc)\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Проверка наличия пропусков\n",
|
||
"if df.isnull().sum().any():\n",
|
||
" print(\"Пропуски обнаружены в следующих столбцах:\")\n",
|
||
" print(df.isnull().sum())\n",
|
||
"else:\n",
|
||
" print(\"Пропуски не обнаружены.\")\n",
|
||
"\n",
|
||
"# Определяем категориальные и числовые столбцы\n",
|
||
"categorical_cols = ['sex', 'smoker', 'region']\n",
|
||
"numerical_cols = ['age', 'bmi', 'children']\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('cat', OneHotEncoder(), categorical_cols),\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Список моделей и их гиперпараметров для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": (LinearRegression(), {}),\n",
|
||
" \"Random Forest Regression\": (RandomForestRegressor(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__max_depth': [None, 10, 20]\n",
|
||
" }),\n",
|
||
" \"Gradient Boosting Regression\": (GradientBoostingRegressor(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__learning_rate': [0.01, 0.1],\n",
|
||
" 'model__max_depth': [3, 5]\n",
|
||
" })\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Распределяем данные на признаки и целевую переменную для задачи регрессии\n",
|
||
"X_reg = df[categorical_cols + numerical_cols]\n",
|
||
"y_reg = df['charges']\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи регрессии\n",
|
||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи регрессии\n",
|
||
"print(\"Результаты для задачи регрессии:\")\n",
|
||
"for name, (model, params) in models_reg.items():\n",
|
||
" pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])\n",
|
||
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='neg_mean_absolute_error')\n",
|
||
" grid_search.fit(X_train_reg, y_train_reg)\n",
|
||
" best_model = grid_search.best_estimator_\n",
|
||
" y_pred_reg = best_model.predict(X_test_reg)\n",
|
||
" mae = mean_absolute_error(y_test_reg, y_pred_reg)\n",
|
||
" mse = mean_squared_error(y_test_reg, y_pred_reg)\n",
|
||
" rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)\n",
|
||
" r2 = r2_score(y_test_reg, y_pred_reg)\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\\n\")\n",
|
||
"\n",
|
||
"# Список моделей и их гиперпараметров для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": (LogisticRegression(max_iter=1000), {\n",
|
||
" 'model__C': [0.1, 1, 10],\n",
|
||
" 'model__solver': ['liblinear', 'lbfgs']\n",
|
||
" }),\n",
|
||
" \"Random Forest Classification\": (RandomForestClassifier(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__max_depth': [None, 10, 20]\n",
|
||
" }),\n",
|
||
" \"Gradient Boosting Classification\": (GradientBoostingClassifier(), {\n",
|
||
" 'model__n_estimators': [100, 200],\n",
|
||
" 'model__learning_rate': [0.01, 0.1],\n",
|
||
" 'model__max_depth': [3, 5]\n",
|
||
" })\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки и целевую переменную для задачи классификации\n",
|
||
"X_class = df[categorical_cols + numerical_cols]\n",
|
||
"y_class = (df['charges'] > df['charges'].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Разделяем данные на обучающую и тестовую выборки для задачи классификации\n",
|
||
"X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"# Обучаем и оцениваем модели для задачи классификации\n",
|
||
"print(\"Результаты для задачи классификации:\")\n",
|
||
"for name, (model, params) in models_class.items():\n",
|
||
" pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])\n",
|
||
" grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')\n",
|
||
" grid_search.fit(X_train_class, y_train_class)\n",
|
||
" best_model = grid_search.best_estimator_\n",
|
||
" y_pred_class = best_model.predict(X_test_class)\n",
|
||
" y_pred_prob = best_model.predict_proba(X_test_class)[:, 1] # Предсказанные вероятности для позитива\n",
|
||
" accuracy = accuracy_score(y_test_class, y_pred_class)\n",
|
||
" precision = precision_score(y_test_class, y_pred_class)\n",
|
||
" recall = recall_score(y_test_class, y_pred_class)\n",
|
||
" f1 = f1_score(y_test_class, y_pred_class)\n",
|
||
" roc_auc = auc(*roc_curve(y_test_class, y_pred_prob)[:2]) # Добавляем вычисление AUC\n",
|
||
"\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Best Parameters: {grid_search.best_params_}\")\n",
|
||
" print(f\"Accuracy: {accuracy}\")\n",
|
||
" print(f\"Precision: {precision}\")\n",
|
||
" print(f\"Recall: {recall}\")\n",
|
||
" print(f\"F1-score: {f1}\")\n",
|
||
" print(f\"AUC: {roc_auc}\")\n",
|
||
"\n",
|
||
" # Вычисляем метрики MAE, MSE, RMSE и R² для логистической регрессии\n",
|
||
" if name == \"Logistic Regression\":\n",
|
||
" mae = mean_absolute_error(y_test_class, y_pred_prob)\n",
|
||
" mse = mean_squared_error(y_test_class, y_pred_prob)\n",
|
||
" rmse = mean_squared_error(y_test_class, y_pred_prob, squared=False)\n",
|
||
" r2 = r2_score(y_test_class, y_pred_prob)\n",
|
||
" print(f\"MAE: {mae}\")\n",
|
||
" print(f\"MSE: {mse}\")\n",
|
||
" print(f\"RMSE: {rmse}\")\n",
|
||
" print(f\"R²: {r2}\\n\")\n",
|
||
"\n",
|
||
" # Визуализация матрицы ошибок\n",
|
||
" cm = confusion_matrix(y_test_class, y_pred_class)\n",
|
||
" disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Less', 'More'])\n",
|
||
" disp.plot(cmap=plt.cm.Blues)\n",
|
||
" plt.title(f'Confusion Matrix for {name}')\n",
|
||
" plt.show()\n",
|
||
"\n",
|
||
" # ROC-кривая\n",
|
||
" fpr, tpr, thresholds = roc_curve(y_test_class, y_pred_prob)\n",
|
||
" plt.figure()\n",
|
||
" plt.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')\n",
|
||
" plt.plot([0, 1], [0, 1], color='red', linestyle='--')\n",
|
||
" plt.xlim([0.0, 1.0])\n",
|
||
" plt.ylim([0.0, 1.05])\n",
|
||
" plt.xlabel('False Positive Rate')\n",
|
||
" plt.ylabel('True Positive Rate')\n",
|
||
" plt.title(f'ROC Curve for {name}')\n",
|
||
" plt.legend(loc=\"lower right\")\n",
|
||
" plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Давайте проанализируем полученные значения метрик и определим, являются ли они нормальными или их можно улучшить.\n",
|
||
"\n",
|
||
"### Оценка смещения и дисперсии для задачи регрессии:\n",
|
||
"\n",
|
||
"### Вывод для задачи регрессии:\n",
|
||
"\n",
|
||
"- **Random Forest Regression** демонстрирует наилучшие результаты по метрикам MAE и R², что указывает на высокую точность и стабильность модели.\n",
|
||
"- **Linear Regression** и **Gradient Boosting Regression** также показывают хорошие результаты, но уступают случайному лесу.\n",
|
||
"\n",
|
||
"### Вывод для задачи классификации:\n",
|
||
"\n",
|
||
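"- **Random Forest Classification** демонстрирует наилучшие результаты по метрикам Accuracy, Recall и F1-score; по Precision на тестовой выборке её немного опережает логистическая регрессия (1.0 против 0.987), но ценой низкого Recall (0.62). В целом это указывает на высокую точность и стабильность модели случайного леса.\n",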
|
||
"- **Logistic Regression** и **Gradient Boosting Classification** также показывают хорошие результаты, но уступают случайному лесу.\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 95,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Оценка смещения и дисперсии для задачи регрессии:\n",
|
||
"Model: Linear Regression\n",
|
||
"MAE (Cross-Validation): Mean = 4187.079314814689, Std = 91.59869329622924\n",
|
||
"R² (Cross-Validation): Mean = 0.7497258834991787, Std = 0.008634448950998193\n",
|
||
"\n",
|
||
"Model: Random Forest Regression\n",
|
||
"MAE (Cross-Validation): Mean = 940.1339838727054, Std = 81.49494082597198\n",
|
||
"R² (Cross-Validation): Mean = 0.9767515503658635, Std = 0.004497963918334725\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Regression\n",
|
||
"MAE (Cross-Validation): Mean = 2189.5694438069922, Std = 96.31677771605804\n",
|
||
"R² (Cross-Validation): Mean = 0.8873175260899913, Std = 0.011188839103376287\n",
|
||
"\n",
|
||
"Оценка смещения и дисперсии для задачи классификации:\n",
|
||
"Model: Logistic Regression\n",
|
||
"Accuracy (Cross-Validation): Mean = 0.8906956776270857, Std = 0.011756347754179863\n",
|
||
"Precision (Cross-Validation): Mean = 0.9965558019216555, Std = 0.0042291925480556145\n",
|
||
"Recall (Cross-Validation): Mean = 0.6516534480440919, Std = 0.038303791687037965\n",
|
||
"F1-score (Cross-Validation): Mean = 0.7873345301431043, Std = 0.027701698921697982\n",
|
||
"\n",
|
||
"Model: Random Forest Classification\n",
|
||
"Accuracy (Cross-Validation): Mean = 0.9978352359579796, Std = 0.001767524034697101\n",
|
||
"Precision (Cross-Validation): Mean = 0.9976878612716764, Std = 0.002831780049460347\n",
|
||
"Recall (Cross-Validation): Mean = 0.9953757225433526, Std = 0.004325615476039242\n",
|
||
"F1-score (Cross-Validation): Mean = 0.9965250705740019, Std = 0.002837294532624193\n",
|
||
"\n",
|
||
"Model: Gradient Boosting Classification\n",
|
||
"Accuracy (Cross-Validation): Mean = 0.9354219923895014, Std = 0.005560116809131367\n",
|
||
"Precision (Cross-Validation): Mean = 0.9873539623899337, Std = 0.011266123317032629\n",
|
||
"Recall (Cross-Validation): Mean = 0.803226240086033, Std = 0.017698107137353723\n",
|
||
"F1-score (Cross-Validation): Mean = 0.8856692810850337, Std = 0.01067664691021022\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from sklearn.model_selection import cross_val_score\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"\n",
|
||
"# Загружаем набор данных\n",
|
||
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
|
||
"\n",
|
||
"# Определяем категориальные и числовые столбцы\n",
|
||
"categorical_cols = ['sex', 'smoker', 'region']\n",
|
||
"numerical_cols = ['age', 'bmi', 'children']\n",
|
||
"\n",
|
||
"# Создаем преобразователь для категориальных и числовых столбцов\n",
|
||
"preprocessor = ColumnTransformer(\n",
|
||
" transformers=[\n",
|
||
" ('cat', OneHotEncoder(), categorical_cols),\n",
|
||
" ('num', StandardScaler(), numerical_cols)\n",
|
||
" ])\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи регрессии\n",
|
||
"X_reg = df[categorical_cols + numerical_cols]\n",
|
||
"y_reg = df['charges']\n",
|
||
"\n",
|
||
"# Список моделей для задачи регрессии\n",
|
||
"models_reg = {\n",
|
||
" \"Linear Regression\": LinearRegression(),\n",
|
||
" \"Random Forest Regression\": RandomForestRegressor(),\n",
|
||
" \"Gradient Boosting Regression\": GradientBoostingRegressor()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Оценка смещения и дисперсии для задачи регрессии\n",
|
||
"print(\"Оценка смещения и дисперсии для задачи регрессии:\")\n",
|
||
"for name, model in models_reg.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" mae_scores = -cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='neg_mean_absolute_error')\n",
|
||
" r2_scores = cross_val_score(pipeline, X_reg, y_reg, cv=5, scoring='r2')\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"MAE (Cross-Validation): Mean = {mae_scores.mean()}, Std = {mae_scores.std()}\")\n",
|
||
" print(f\"R² (Cross-Validation): Mean = {r2_scores.mean()}, Std = {r2_scores.std()}\")\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# Разделяем данные на признаки (X) и целевую переменную (y) для задачи классификации\n",
|
||
"X_class = df[categorical_cols + numerical_cols]\n",
|
||
"y_class = (df['charges'] > df['charges'].mean()).astype(int)\n",
|
||
"\n",
|
||
"# Список моделей для задачи классификации\n",
|
||
"models_class = {\n",
|
||
" \"Logistic Regression\": LogisticRegression(),\n",
|
||
" \"Random Forest Classification\": RandomForestClassifier(),\n",
|
||
" \"Gradient Boosting Classification\": GradientBoostingClassifier()\n",
|
||
"}\n",
|
||
"\n",
|
||
"# Оценка смещения и дисперсии для задачи классификации\n",
|
||
"print(\"Оценка смещения и дисперсии для задачи классификации:\")\n",
|
||
"for name, model in models_class.items():\n",
|
||
" pipeline = Pipeline(steps=[\n",
|
||
" ('preprocessor', preprocessor),\n",
|
||
" ('model', model)\n",
|
||
" ])\n",
|
||
" accuracy_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='accuracy')\n",
|
||
" precision_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='precision')\n",
|
||
" recall_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='recall')\n",
|
||
" f1_scores = cross_val_score(pipeline, X_class, y_class, cv=5, scoring='f1')\n",
|
||
" print(f\"Model: {name}\")\n",
|
||
" print(f\"Accuracy (Cross-Validation): Mean = {accuracy_scores.mean()}, Std = {accuracy_scores.std()}\")\n",
|
||
" print(f\"Precision (Cross-Validation): Mean = {precision_scores.mean()}, Std = {precision_scores.std()}\")\n",
|
||
" print(f\"Recall (Cross-Validation): Mean = {recall_scores.mean()}, Std = {recall_scores.std()}\")\n",
|
||
" print(f\"F1-score (Cross-Validation): Mean = {f1_scores.mean()}, Std = {f1_scores.std()}\")\n",
|
||
" print()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 96,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1200x600 with 2 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1200x1200 with 4 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import cross_val_score, cross_validate\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# Fixed seed so the stochastic ensembles give the same scores on every run\n",
"RANDOM_STATE = 42\n",
"# Number of cross-validation folds used for every model\n",
"CV_FOLDS = 5\n",
"\n",
"# Load the dataset\n",
"df = pd.read_csv(\"data/Medical_insurance.csv\")\n",
"\n",
"# Categorical and numerical feature columns\n",
"categorical_cols = ['sex', 'smoker', 'region']\n",
"numerical_cols = ['age', 'bmi', 'children']\n",
"\n",
"# One-hot encode the categorical columns and standardise the numerical ones\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('cat', OneHotEncoder(), categorical_cols),\n",
"        ('num', StandardScaler(), numerical_cols)\n",
"    ])\n",
"\n",
"\n",
"def evaluate_models(models, X, y, scoring):\n",
"    \"\"\"Cross-validate every model once and summarise each metric.\n",
"\n",
"    Each model is wrapped in a pipeline with the module-level `preprocessor`\n",
"    and scored with a single `cross_validate` call, so all metrics of a model\n",
"    are computed from the same fitted folds.\n",
"\n",
"    Args:\n",
"        models: dict mapping a display name to an unfitted estimator.\n",
"        X: feature frame passed to the pipeline.\n",
"        y: target values.\n",
"        scoring: dict mapping a metric label to a scikit-learn scorer name.\n",
"\n",
"    Returns:\n",
"        dict mapping each metric label to a `(means, stds)` pair of lists,\n",
"        ordered like `models`.\n",
"    \"\"\"\n",
"    summary = {label: ([], []) for label in scoring}\n",
"    for model in models.values():\n",
"        pipeline = Pipeline(steps=[\n",
"            ('preprocessor', preprocessor),\n",
"            ('model', model)\n",
"        ])\n",
"        cv_results = cross_validate(pipeline, X, y, cv=CV_FOLDS, scoring=scoring)\n",
"        for label in scoring:\n",
"            # Multi-metric results are stored under 'test_<label>'\n",
"            fold_scores = cv_results[f'test_{label}']\n",
"            summary[label][0].append(fold_scores.mean())\n",
"            summary[label][1].append(fold_scores.std())\n",
"    return summary\n",
"\n",
"\n",
"def plot_metric(ax, model_names, means, stds, ylabel, title):\n",
"    \"\"\"Draw one bar per model with the cross-validation std as the error bar.\"\"\"\n",
"    ax.bar(model_names, means, yerr=stds, align='center', alpha=0.5, ecolor='black', capsize=10)\n",
"    ax.set_ylabel(ylabel)\n",
"    ax.set_title(title)\n",
"    ax.yaxis.grid(True)\n",
"\n",
"\n",
"# Split the data into features (X) and the target (y) for the regression task\n",
"X_reg = df[categorical_cols + numerical_cols]\n",
"y_reg = df['charges']\n",
"\n",
"# Models for the regression task\n",
"models_reg = {\n",
"    \"Linear Regression\": LinearRegression(),\n",
"    \"Random Forest Regression\": RandomForestRegressor(random_state=RANDOM_STATE),\n",
"    \"Gradient Boosting Regression\": GradientBoostingRegressor(random_state=RANDOM_STATE)\n",
"}\n",
"\n",
"# Mean and spread of the cross-validation scores for the regression task\n",
"reg_summary = evaluate_models(\n",
"    models_reg, X_reg, y_reg,\n",
"    {'mae': 'neg_mean_absolute_error', 'r2': 'r2'}\n",
")\n",
"# The scorer reports negated MAE, so flip the sign of the means; the std is unaffected\n",
"mae_means = [-neg_mean for neg_mean in reg_summary['mae'][0]]\n",
"mae_stds = reg_summary['mae'][1]\n",
"r2_means, r2_stds = reg_summary['r2']\n",
"\n",
"# Plot the regression results\n",
"fig, ax = plt.subplots(1, 2, figsize=(12, 6))\n",
"reg_names = list(models_reg)\n",
"plot_metric(ax[0], reg_names, mae_means, mae_stds,\n",
"            'MAE', 'Mean Absolute Error (MAE) for Regression Models')\n",
"plot_metric(ax[1], reg_names, r2_means, r2_stds,\n",
"            'R²', 'R-squared (R²) for Regression Models')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Split the data into features (X) and the target (y) for the classification task\n",
"X_class = df[categorical_cols + numerical_cols]\n",
"# Binary target: 1 when charges exceed the dataset-wide mean, 0 otherwise\n",
"y_class = (df['charges'] > df['charges'].mean()).astype(int)\n",
"\n",
"# Models for the classification task\n",
"models_class = {\n",
"    \"Logistic Regression\": LogisticRegression(),\n",
"    \"Random Forest Classification\": RandomForestClassifier(random_state=RANDOM_STATE),\n",
"    \"Gradient Boosting Classification\": GradientBoostingClassifier(random_state=RANDOM_STATE)\n",
"}\n",
"\n",
"# Mean and spread of the cross-validation scores for the classification task\n",
"class_summary = evaluate_models(\n",
"    models_class, X_class, y_class,\n",
"    {'accuracy': 'accuracy', 'precision': 'precision', 'recall': 'recall', 'f1': 'f1'}\n",
")\n",
"accuracy_means, accuracy_stds = class_summary['accuracy']\n",
"precision_means, precision_stds = class_summary['precision']\n",
"recall_means, recall_stds = class_summary['recall']\n",
"f1_means, f1_stds = class_summary['f1']\n",
"\n",
"# Plot the classification results\n",
"fig, ax = plt.subplots(2, 2, figsize=(12, 12))\n",
"class_names = list(models_class)\n",
"plot_metric(ax[0, 0], class_names, accuracy_means, accuracy_stds,\n",
"            'Accuracy', 'Accuracy for Classification Models')\n",
"plot_metric(ax[0, 1], class_names, precision_means, precision_stds,\n",
"            'Precision', 'Precision for Classification Models')\n",
"plot_metric(ax[1, 0], class_names, recall_means, recall_stds,\n",
"            'Recall', 'Recall for Classification Models')\n",
"plot_metric(ax[1, 1], class_names, f1_means, f1_stds,\n",
"            'F1-score', 'F1-score for Classification Models')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|