MAI_ISE-31_Andrikhov-A-S/lab3.ipynb
2024-11-09 09:10:23 +04:00

1102 lines
364 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Лабораторная работа 3."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',\n",
" 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',\n",
" 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',\n",
" 'Asthma', 'KidneyDisease', 'SkinCancer'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv(\"./datasets/var2/2020/heart_2020_cleaned.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Бизнес-цели:\n",
"- Разработка персонализированных программ профилактики сердечно-сосудистых заболеваний. Цель технического проекта: создание модели машинного обучения, которая будет прогнозировать риск сердечного приступа для каждого пациента на основе его индивидуальных факторов риска, и разработка онлайн-платформы или приложения для предоставления персонализированных рекомендаций по профилактике.\n",
"- Улучшение качества медицинской Цель технического проекта: использование данных для выявления групп населения с наибольшим риском сердечного приступа и разработки целевых программ профилактики и раннего выявления заболеваний.\n",
"\n",
"#### Выполним разбиение на 3 выборки: обучающую, контрольную и тестовую"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 156699\n",
"Размер контрольной выборки: 67157\n",
"Размер тестовой выборки: 95939\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"df = pd.read_csv(\"./datasets/var2/2020/heart_2020_cleaned.csv\")\n",
"\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в HeartDisease:\n",
"HeartDisease\n",
"No 292422\n",
"Yes 27373\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в Обучающей выборке:\n",
"HeartDisease\n",
"No 143331\n",
"Yes 13368\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в Контрольной выборке:\n",
"HeartDisease\n",
"No 61442\n",
"Yes 5715\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в Тестовой выборке:\n",
"HeartDisease\n",
"No 87649\n",
"Yes 8290\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Проверка распределения классов в целевой переменной\n",
"class_distribution = df['HeartDisease'].value_counts()\n",
"print(\"Распределение классов в HeartDisease:\")\n",
"print(class_distribution)\n",
"\n",
"# Визуализация распределения классов\n",
"sns.countplot(x='HeartDisease', data=df)\n",
"plt.title('Распределение классов в HeartDisease')\n",
"plt.show()\n",
"\n",
"# Проверка сбалансированности для каждой выборки\n",
"def check_balance(df, title):\n",
" class_distribution = df['HeartDisease'].value_counts()\n",
" print(f\"Распределение классов в {title}:\")\n",
" print(class_distribution)\n",
" sns.countplot(x='HeartDisease', data=df)\n",
" plt.title(f'Распределение классов в {title}')\n",
" plt.show()\n",
"\n",
"# Проверка сбалансированности для обучающей, контрольной и тестовой выборок\n",
"check_balance(train_df, 'Обучающей выборке')\n",
"check_balance(val_df, 'Контрольной выборке')\n",
"check_balance(test_df, 'Тестовой выборке')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Можно заметить, что данные не сбалансированы - во всех выборках количество значений \"No\" превышает \"Yes\" в среднем в 10 раз. Для балансировки данных будет применен метод upsampling"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Распределение классов в всем датасете:\n",
"Класс No: 292422 (91.44%)\n",
"Класс Yes: 27373 (8.56%)\n",
"\n",
"Распределение классов в Обучающей выборке до upsampling:\n",
"Класс No: 143331 (91.47%)\n",
"Класс Yes: 13368 (8.53%)\n",
"Размер обучающей выборки после upsampling: 286662\n",
"\n",
"Распределение классов в Обучающей выборке после upsampling:\n",
"Класс No: 143331 (50.00%)\n",
"Класс Yes: 143331 (50.00%)\n",
"\n",
"Распределение классов в Контрольной выборке:\n",
"Класс No: 61442 (91.49%)\n",
"Класс Yes: 5715 (8.51%)\n",
"\n",
"Распределение классов в Тестовой выборке:\n",
"Класс No: 87649 (91.36%)\n",
"Класс Yes: 8290 (8.64%)\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"# Функция для проверки балансировки данных\n",
"def check_balance(df, title):\n",
" class_distribution = df['HeartDisease'].value_counts()\n",
" print(f\"\\nРаспределение классов в {title}:\")\n",
" for cls, count in class_distribution.items():\n",
" print(f\"Класс {cls}: {count} ({count / len(df) * 100:.2f}%)\")\n",
"\n",
"# Проверка балансировки для всего датасета\n",
"check_balance(df, 'всем датасете')\n",
"\n",
"# Проверка балансировки для обучающей выборки до upsampling\n",
"check_balance(train_df, 'Обучающей выборке до upsampling')\n",
"\n",
"# Применение upsampling к обучающей выборке\n",
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
"\n",
"# Инициализация RandomOverSampler\n",
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение upsampling\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Создание нового DataFrame с балансированными данными\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"# Вывод размеров выборок после upsampling\n",
"print(\"Размер обучающей выборки после upsampling:\", len(train_df_resampled))\n",
"\n",
"# Проверка балансировки для обучающей выборки после upsampling\n",
"check_balance(train_df_resampled, 'Обучающей выборке после upsampling')\n",
"\n",
"# Проверка балансировки для контрольной и тестовой выборок (они не должны измениться)\n",
"check_balance(val_df, 'Контрольной выборке')\n",
"check_balance(test_df, 'Тестовой выборке')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Данные были сбалансированы. Теперь можно перейти к конструированию признаков. Поставлены следующие задачи:\n",
"- Разработка персонализированных программ профилактики сердечно-сосудистых заболеваний\n",
"- Улучшение качества медицинской помощи"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Унитарное кодирование категориальных признаков (one-hot encoding)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Определение категориальных признаков\n",
"categorical_features = [\n",
" 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
" 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
"]\n",
"\n",
"# Применение one-hot encoding к обучающей выборке\n",
"train_df_resampled_encoded = pd.get_dummies(train_df_resampled, columns=categorical_features)\n",
"\n",
"# Применение one-hot encoding к контрольной выборке\n",
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
"\n",
"# Применение one-hot encoding к тестовой выборке\n",
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Дискретизация числовых признаков"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Применение upsampling к обучающей выборке\n",
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
"\n",
"# Инициализация RandomOverSampler\n",
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение upsampling\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Создание нового DataFrame с балансированными данными\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"# Определение числовых признаков для дискретизации\n",
"numerical_features = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']\n",
"\n",
"# Функция для дискретизации числовых признаков\n",
"def discretize_features(df, features, bins=5, labels=False):\n",
" for feature in features:\n",
" df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
" return df\n",
"\n",
"# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n",
"train_df_resampled = discretize_features(train_df_resampled, numerical_features)\n",
"val_df = discretize_features(val_df, numerical_features)\n",
"test_df = discretize_features(test_df, numerical_features)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ручной синтез. Создание новых признаков на основе экспертных знаний и логики предметной области. Например, для данных о продаже автомобилей можно создать признак \"возраст автомобиля\" как разницу между текущим годом и годом выпуска."
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# Применение upsampling к обучающей выборке\n",
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
"\n",
"# Инициализация RandomOverSampler\n",
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение upsampling\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Создание нового DataFrame с балансированными данными\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n",
"age_mapping = {\n",
" '18-24': 21,\n",
" '25-29': 27,\n",
" '30-34': 32,\n",
" '35-39': 37,\n",
" '40-44': 42,\n",
" '45-49': 47,\n",
" '50-54': 52,\n",
" '55-59': 57,\n",
" '60-64': 62,\n",
" '65-69': 67,\n",
" '70-74': 72,\n",
" '75-79': 77,\n",
" '80 or older': 80\n",
"}\n",
"\n",
"train_df_resampled['Age'] = train_df_resampled['AgeCategory'].map(age_mapping)\n",
"val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n",
"test_df['Age'] = test_df['AgeCategory'].map(age_mapping)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Применение upsampling к обучающей выборке\n",
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
"\n",
"# Инициализация RandomOverSampler\n",
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение upsampling\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"\n",
"# Создание нового DataFrame с балансированными данными\n",
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
"\n",
"# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n",
"age_mapping = {\n",
" '18-24': 21,\n",
" '25-29': 27,\n",
" '30-34': 32,\n",
" '35-39': 37,\n",
" '40-44': 42,\n",
" '45-49': 47,\n",
" '50-54': 52,\n",
" '55-59': 57,\n",
" '60-64': 62,\n",
" '65-69': 67,\n",
" '70-74': 72,\n",
" '75-79': 77,\n",
" '80 or older': 80\n",
"}\n",
"\n",
"train_df_resampled['Age'] = train_df_resampled['AgeCategory'].map(age_mapping)\n",
"val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n",
"test_df['Age'] = test_df['AgeCategory'].map(age_mapping)\n",
"\n",
"# Определение числовых признаков для масштабирования\n",
"numerical_features_to_scale = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Age']\n",
"\n",
"# Инициализация StandardScaler\n",
"scaler = StandardScaler()\n",
"\n",
"# Масштабирование числовых признаков в обучающей выборке\n",
"train_df_resampled[numerical_features_to_scale] = scaler.fit_transform(train_df_resampled[numerical_features_to_scale])\n",
"\n",
"# Масштабирование числовых признаков в контрольной и тестовой выборках\n",
"val_df[numerical_features_to_scale] = scaler.transform(val_df[numerical_features_to_scale])\n",
"test_df[numerical_features_to_scale] = scaler.transform(test_df[numerical_features_to_scale])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Конструирование признаков с применением фреймворка Featuretools"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка после конструирования признаков:\n",
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
"id \n",
"0 False 32.23 False False False 0.0 \n",
"1 False 29.53 False False False 0.0 \n",
"2 False 30.13 False False False 0.0 \n",
"3 False 35.43 False False False 0.0 \n",
"4 False 29.53 False False False 0.0 \n",
"\n",
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
"id \n",
"0 0.0 True Male 75-79 White Yes \n",
"1 0.0 True Female 50-54 White No \n",
"2 0.0 False Male 50-54 White No \n",
"3 15.0 False Female 18-24 Hispanic No \n",
"4 0.0 True Female 65-69 White Yes \n",
"\n",
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \\\n",
"id \n",
"0 True Fair 6.0 False True True \n",
"1 True Very good 8.0 False False False \n",
"2 True Excellent 7.0 False False False \n",
"3 True Good 7.0 False False False \n",
"4 False Good 10.0 False False False \n",
"\n",
" Age \n",
"id \n",
"0 77 \n",
"1 52 \n",
"2 52 \n",
"3 21 \n",
"4 67 \n",
"Контрольная выборка после конструирования признаков:\n",
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
"id \n",
"80125 False 22.71 False False False 0.0 \n",
"116296 False 25.80 False False False 0.0 \n",
"18780 False 17.74 True False False 0.0 \n",
"233006 <NA> NaN <NA> <NA> <NA> NaN \n",
"182306 <NA> NaN <NA> <NA> <NA> NaN \n",
"\n",
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
"id \n",
"80125 0.0 False Male 25-29 White No \n",
"116296 0.0 False Male 50-54 Hispanic No \n",
"18780 0.0 False Female 65-69 White No \n",
"233006 NaN <NA> NaN NaN NaN NaN \n",
"182306 NaN <NA> NaN NaN NaN NaN \n",
"\n",
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease \\\n",
"id \n",
"80125 True Excellent 8.0 False False \n",
"116296 True Good 7.0 False False \n",
"18780 True Good 7.0 False False \n",
"233006 <NA> NaN NaN <NA> <NA> \n",
"182306 <NA> NaN NaN <NA> <NA> \n",
"\n",
" SkinCancer Age \n",
"id \n",
"80125 False 27 \n",
"116296 False 52 \n",
"18780 False 67 \n",
"233006 <NA> <NA> \n",
"182306 <NA> <NA> \n",
"Тестовая выборка после конструирования признаков:\n",
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
"id \n",
"271884 <NA> NaN <NA> <NA> <NA> NaN \n",
"270361 <NA> NaN <NA> <NA> <NA> NaN \n",
"219060 <NA> NaN <NA> <NA> <NA> NaN \n",
"24010 False 26.5 False False False 14.0 \n",
"181930 <NA> NaN <NA> <NA> <NA> NaN \n",
"\n",
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
"id \n",
"271884 NaN <NA> NaN NaN NaN NaN \n",
"270361 NaN <NA> NaN NaN NaN NaN \n",
"219060 NaN <NA> NaN NaN NaN NaN \n",
"24010 0.0 True Male 75-79 White Yes \n",
"181930 NaN <NA> NaN NaN NaN NaN \n",
"\n",
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease \\\n",
"id \n",
"271884 <NA> NaN NaN <NA> <NA> \n",
"270361 <NA> NaN NaN <NA> <NA> \n",
"219060 <NA> NaN NaN <NA> <NA> \n",
"24010 True Excellent 9.0 False False \n",
"181930 <NA> NaN NaN <NA> <NA> \n",
"\n",
" SkinCancer Age \n",
"id \n",
"271884 <NA> <NA> \n",
"270361 <NA> <NA> \n",
"219060 <NA> <NA> \n",
"24010 False 77 \n",
"181930 <NA> <NA> \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
]
}
],
"source": [
"import featuretools as ft\n",
"\n",
"# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n",
"age_mapping = {\n",
" '18-24': 21,\n",
" '25-29': 27,\n",
" '30-34': 32,\n",
" '35-39': 37,\n",
" '40-44': 42,\n",
" '45-49': 47,\n",
" '50-54': 52,\n",
" '55-59': 57,\n",
" '60-64': 62,\n",
" '65-69': 67,\n",
" '70-74': 72,\n",
" '75-79': 77,\n",
" '80 or older': 80\n",
"}\n",
"\n",
"train_df['Age'] = train_df['AgeCategory'].map(age_mapping)\n",
"val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n",
"test_df['Age'] = test_df['AgeCategory'].map(age_mapping)\n",
"\n",
"# Определение сущностей\n",
"es = ft.EntitySet(id='heart_data')\n",
"es = es.add_dataframe(dataframe_name='train', dataframe=train_df, index='id')\n",
"\n",
"# Генерация признаков\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='train', max_depth=2)\n",
"\n",
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
"\n",
"# Вывод первых нескольких строк для проверки\n",
"print(\"Обучающая выборка после конструирования признаков:\")\n",
"print(feature_matrix.head())\n",
"print(\"Контрольная выборка после конструирования признаков:\")\n",
"print(val_feature_matrix.head())\n",
"print(\"Тестовая выборка после конструирования признаков:\")\n",
"print(test_feature_matrix.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Оценка качества каждого набора признаков\n",
"- Предсказательная способность Метрики: RMSE, MAE, R²\n",
"- Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
"- Скорость вычисления Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
"- Надежность Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
"- Корреляция Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
"- Цельность Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 156699\n",
"Размер контрольной выборки: 67157\n",
"Размер тестовой выборки: 95939\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature Importance:\n",
" feature importance\n",
"0 HeartDisease 0.854583\n",
"5 Age 0.016474\n",
"1 BMI 0.013826\n",
"11 Stroke_True 0.009034\n",
"2 PhysicalHealth 0.008447\n",
"12 DiffWalking_False 0.008264\n",
"4 SleepTime 0.007288\n",
"37 Diabetic_Yes 0.007211\n",
"10 Stroke_False 0.006678\n",
"44 GenHealth_Poor 0.005835\n",
"42 GenHealth_Fair 0.005484\n",
"13 DiffWalking_True 0.005208\n",
"3 MentalHealth 0.004902\n",
"35 Diabetic_No 0.003429\n",
"48 KidneyDisease_False 0.003063\n",
"28 AgeCategory_80 or older 0.003052\n",
"14 Sex_Female 0.002828\n",
"15 Sex_Male 0.002511\n",
"49 KidneyDisease_True 0.002422\n",
"6 Smoking_False 0.001900\n",
"7 Smoking_True 0.001779\n",
"41 GenHealth_Excellent 0.001690\n",
"45 GenHealth_Very good 0.001673\n",
"39 PhysicalActivity_False 0.001611\n",
"43 GenHealth_Good 0.001548\n",
"34 Race_White 0.001539\n",
"40 PhysicalActivity_True 0.001510\n",
"50 SkinCancer_False 0.001392\n",
"51 SkinCancer_True 0.001354\n",
"47 Asthma_True 0.001334\n",
"46 Asthma_False 0.001333\n",
"27 AgeCategory_75-79 0.001169\n",
"26 AgeCategory_70-74 0.000973\n",
"31 Race_Black 0.000868\n",
"24 AgeCategory_60-64 0.000837\n",
"32 Race_Hispanic 0.000803\n",
"25 AgeCategory_65-69 0.000783\n",
"33 Race_Other 0.000662\n",
"23 AgeCategory_55-59 0.000617\n",
"8 AlcoholDrinking_False 0.000559\n",
"9 AlcoholDrinking_True 0.000550\n",
"29 Race_American Indian/Alaskan Native 0.000514\n",
"36 Diabetic_No, borderline diabetes 0.000471\n",
"22 AgeCategory_50-54 0.000441\n",
"21 AgeCategory_45-49 0.000326\n",
"20 AgeCategory_40-44 0.000283\n",
"30 Race_Asian 0.000265\n",
"19 AgeCategory_35-39 0.000218\n",
"38 Diabetic_Yes (during pregnancy) 0.000135\n",
"18 AgeCategory_30-34 0.000124\n",
"17 AgeCategory_25-29 0.000108\n",
"16 AgeCategory_18-24 0.000094\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# Определение категориальных признаков\n",
"categorical_features = [\n",
" 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
" 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
"]\n",
"\n",
"# Применение one-hot encoding к обучающей выборке\n",
"train_df_encoded = pd.get_dummies(train_df, columns=categorical_features)\n",
"\n",
"# Применение one-hot encoding к контрольной выборке\n",
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
"\n",
"# Применение one-hot encoding к тестовой выборке\n",
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)\n",
"\n",
"# Определение сущностей\n",
"es = ft.EntitySet(id='heart_data')\n",
"es = es.add_dataframe(dataframe_name='heart', dataframe=train_df_encoded, index='id')\n",
"\n",
"# Генерация признаков\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='heart', max_depth=2)\n",
"\n",
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df_encoded.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df_encoded.index)\n",
"\n",
"# Оценка важности признаков\n",
"X = feature_matrix\n",
"y = train_df_encoded['HeartDisease']\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Обучение модели\n",
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Получение важности признаков\n",
"importances = model.feature_importances_\n",
"feature_names = feature_matrix.columns\n",
"\n",
"# Сортировка признаков по важности\n",
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
"\n",
"print(\"Feature Importance:\")\n",
"print(feature_importance)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 15670\n",
"Размер контрольной выборки: 6716\n",
"Размер тестовой выборки: 9594\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 1.0\n",
"Precision: 1.0\n",
"Recall: 1.0\n",
"F1 Score: 1.0\n",
"ROC AUC: 1.0\n",
"Cross-validated Accuracy: 0.906126356094448\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9994894703254626\n",
"Train Precision: 0.9992816091954023\n",
"Train Recall: 0.9949928469241774\n",
"Train F1 Score: 0.9971326164874552\n",
"Train ROC AUC: 0.9974613898298016\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
"from sklearn.model_selection import cross_val_score\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Уменьшение размера выборки для ускорения работы (опционально)\n",
"df = df.sample(frac=0.1, random_state=42)\n",
"\n",
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Вывод размеров выборок\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# Определение категориальных признаков\n",
"categorical_features = [\n",
" 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
" 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
"]\n",
"\n",
"# Применение one-hot encoding к обучающей выборке\n",
"train_df_encoded = pd.get_dummies(train_df, columns=categorical_features)\n",
"\n",
"# Применение one-hot encoding к контрольной выборке\n",
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
"\n",
"# Применение one-hot encoding к тестовой выборке\n",
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)\n",
"\n",
"# Определение сущностей\n",
"es = ft.EntitySet(id='heart_data')\n",
"es = es.add_dataframe(dataframe_name='heart', dataframe=train_df_encoded, index='id')\n",
"\n",
"# Генерация признаков с уменьшенной глубиной\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='heart', max_depth=1)\n",
"\n",
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df_encoded.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df_encoded.index)\n",
"\n",
"# Удаление строк с NaN\n",
"feature_matrix = feature_matrix.dropna()\n",
"val_feature_matrix = val_feature_matrix.dropna()\n",
"test_feature_matrix = test_feature_matrix.dropna()\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train = feature_matrix.drop('HeartDisease', axis=1)\n",
"y_train = feature_matrix['HeartDisease']\n",
"X_val = val_feature_matrix.drop('HeartDisease', axis=1)\n",
"y_val = val_feature_matrix['HeartDisease']\n",
"X_test = test_feature_matrix.drop('HeartDisease', axis=1)\n",
"y_test = test_feature_matrix['HeartDisease']\n",
"\n",
"# Выбор модели\n",
"model = RandomForestClassifier(random_state=42)\n",
"\n",
"# Обучение модели\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Предсказание и оценка\n",
"y_pred = model.predict(X_test)\n",
"\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"precision = precision_score(y_test, y_pred)\n",
"recall = recall_score(y_test, y_pred)\n",
"f1 = f1_score(y_test, y_pred)\n",
"roc_auc = roc_auc_score(y_test, y_pred)\n",
"\n",
"print(f\"Accuracy: {accuracy}\")\n",
"print(f\"Precision: {precision}\")\n",
"print(f\"Recall: {recall}\")\n",
"print(f\"F1 Score: {f1}\")\n",
"print(f\"ROC AUC: {roc_auc}\")\n",
"\n",
"# Кросс-валидация\n",
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')\n",
"accuracy_cv = scores.mean()\n",
"print(f\"Cross-validated Accuracy: {accuracy_cv}\")\n",
"\n",
"# Анализ важности признаков\n",
"feature_importances = model.feature_importances_\n",
"feature_names = X_train.columns\n",
"\n",
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"sns.barplot(x='Importance', y='Feature', data=importance_df)\n",
"plt.title('Feature Importance')\n",
"plt.show()\n",
"\n",
"# Проверка на переобучение\n",
"y_train_pred = model.predict(X_train)\n",
"\n",
"accuracy_train = accuracy_score(y_train, y_train_pred)\n",
"precision_train = precision_score(y_train, y_train_pred)\n",
"recall_train = recall_score(y_train, y_train_pred)\n",
"f1_train = f1_score(y_train, y_train_pred)\n",
"roc_auc_train = roc_auc_score(y_train, y_train_pred)\n",
"\n",
"print(f\"Train Accuracy: {accuracy_train}\")\n",
"print(f\"Train Precision: {precision_train}\")\n",
"print(f\"Train Recall: {recall_train}\")\n",
"print(f\"Train F1 Score: {f1_train}\")\n",
"print(f\"Train ROC AUC: {roc_auc_train}\")\n",
"\n",
"# Визуализация результатов\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
"plt.xlabel('Actual HeartDisease')\n",
"plt.ylabel('Predicted HeartDisease')\n",
"plt.title('Actual vs Predicted HeartDisease')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}