{ "cells": [ { "metadata": { "ExecuteTime": { "end_time": "2024-12-24T15:40:10.929503Z", "start_time": "2024-12-24T15:40:08.889350Z" } }, "cell_type": "code", "source": [ "from sklearn.utils import resample\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn import metrics\n", "from imblearn.over_sampling import RandomOverSampler\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from sklearn.metrics import ConfusionMatrixDisplay\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.linear_model import LinearRegression, LogisticRegression\n", "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier\n", "from sklearn.model_selection import train_test_split, GridSearchCV\n", "from sklearn.linear_model import SGDClassifier, SGDRegressor\n", "from sklearn.metrics import (\n", " precision_score, recall_score, accuracy_score, roc_auc_score, f1_score,\n", " matthews_corrcoef, cohen_kappa_score, confusion_matrix\n", ")\n", "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n", "import numpy as np\n", "import featuretools as ft\n", "from sklearn.metrics import accuracy_score, classification_report\n", "\n", "df = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n", "\n", "# Обработка пропущенных значений\n", "df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())\n", "\n", "# Удаление или замена неизвестных категорий\n", "# Например, замена 'Other' на 'Unknown' в столбце 'gender'\n", "df['gender'] = df['gender'].replace('Other', 'Unknown')\n", "\n", "# Разделяем классы\n", "stroke_0 = df[df['stroke'] == 0]\n", "stroke_1 = df[df['stroke'] == 1]\n", "\n", "# Увеличиваем выборку для 1\n", "stroke_1 = 
resample(stroke_1, replace=True, n_samples=len(stroke_0), random_state=42)\n", "\n", "# Объединяем классы\n", "df = pd.concat([stroke_0, stroke_1])\n", "\n", "df.info()" ], "id": "db3cd8711d083df", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 9722 entries, 249 to 134\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 9722 non-null int64 \n", " 1 gender 9722 non-null object \n", " 2 age 9722 non-null float64\n", " 3 hypertension 9722 non-null int64 \n", " 4 heart_disease 9722 non-null int64 \n", " 5 ever_married 9722 non-null object \n", " 6 work_type 9722 non-null object \n", " 7 Residence_type 9722 non-null object \n", " 8 avg_glucose_level 9722 non-null float64\n", " 9 bmi 9722 non-null float64\n", " 10 smoking_status 9722 non-null object \n", " 11 stroke 9722 non-null int64 \n", "dtypes: float64(3), int64(4), object(5)\n", "memory usage: 987.4+ KB\n" ] } ], "execution_count": 1 }, { "metadata": {}, "cell_type": "markdown", "source": [ "Классификация: Предсказать вероятность инсульта на основе данных пациента.\n", "\n", "Регрессия: Предсказать уровень глюкозы в крови на основе данных пациента." 
], "id": "d5f640ac158b69c5" }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-24T15:40:11.000561Z", "start_time": "2024-12-24T15:40:10.979999Z" } }, "cell_type": "code", "source": [
"# Target variables\n",
"X = df.drop('stroke', axis=1)\n",
"y_class = df['stroke'] # classification target\n",
"y_reg = df['avg_glucose_level'] # regression target\n",
"\n",
"# Feature columns handed to the preprocessor\n",
"# NOTE(review): 'avg_glucose_level' is also y_reg, so any regression pipeline that\n",
"# reuses this preprocessor receives its own target as an input feature.\n",
"categorical_features = ['gender', 'ever_married', 'smoking_status']\n",
"numerical_features = ['age', 'avg_glucose_level', 'bmi']\n",
"\n",
"# Scale numeric columns and one-hot encode categorical ones; categories unseen\n",
"# during fit are ignored (handle_unknown='ignore')\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', StandardScaler(), numerical_features),\n",
"        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])\n",
"\n",
"# Train/test split (80/20) shared by both targets\n",
"X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"def estimate_bias_variance(model, X, y, n_rounds=100, random_state=42):\n",
"    \"\"\"Estimate squared bias and prediction variance of `model` by bootstrapping.\n",
"\n",
"    Every round trains a fresh clone of `model` on a bootstrap resample of (X, y)\n",
"    and predicts on the full X. Resampling is what makes the rounds differ: refitting\n",
"    an estimator with a fixed random_state on identical data returns identical\n",
"    predictions each time, which gives a variance of ~0.\n",
"\n",
"    Args:\n",
"        model: scikit-learn estimator or pipeline; it is cloned, never refitted in place.\n",
"        X: training features, indexable by sklearn.utils.resample.\n",
"        y: training target aligned with X.\n",
"        n_rounds: number of bootstrap fits.\n",
"        random_state: seed for the bootstrap draws.\n",
"\n",
"    Returns:\n",
"        (bias, variance): mean squared gap between y and the round-averaged\n",
"        prediction, and the mean per-sample variance of predictions across rounds.\n",
"\n",
"    Raises:\n",
"        ValueError: if n_rounds is smaller than 1.\n",
"    \"\"\"\n",
"    from sklearn.base import clone  # local import keeps this cell self-contained\n",
"\n",
"    if n_rounds < 1:\n",
"        raise ValueError('n_rounds must be at least 1')\n",
"    rng = np.random.RandomState(random_state)\n",
"    y_true = np.asarray(y)\n",
"    predictions = []\n",
"    for _ in range(n_rounds):\n",
"        # Passing the shared RandomState advances it, so every round draws new rows\n",
"        X_boot, y_boot = resample(X, y, replace=True, random_state=rng)\n",
"        predictions.append(clone(model).fit(X_boot, y_boot).predict(X))\n",
"    predictions = np.array(predictions)\n",
"    bias = np.mean((y_true - predictions.mean(axis=0)) ** 2)\n",
"    variance = np.mean(predictions.var(axis=0))\n",
"    return bias, variance"
], "id": "55c612d9a4b90a55", "outputs": [], "execution_count": 2 }, { "metadata": {}, "cell_type": "markdown", "source": "Классификация", "id": "273d7304e338f532" }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-24T15:40:50.317809Z", "start_time": "2024-12-24T15:40:11.011074Z" } }, "cell_type": "code", "source": [ "# Задача классификации\n", "class_pipeline_rf = Pipeline(steps=[\n", " ('preprocessor', preprocessor),\n", " ('classifier', RandomForestClassifier(random_state=42))])\n", "\n", "class_pipeline_sgd = Pipeline(steps=[\n", " ('preprocessor', preprocessor),\n", " ('classifier', SGDClassifier(loss='log_loss', penalty='l2', 
random_state=42, max_iter=2000))]) \n", "\n", "# Настройка гиперпараметров\n", "param_grid_class_rf = {\n", " 'classifier__n_estimators': [100, 200],\n", " 'classifier__max_depth': [None, 10, 20]}\n", "\n", "param_grid_class_sgd = {\n", " 'classifier__alpha': [0.0001, 0.001, 0.01],\n", " 'classifier__learning_rate': ['constant', 'adaptive'],\n", " 'classifier__eta0': [0.01, 0.1]}\n", "\n", "# Поиск гиперпараметров\n", "grid_search_class_rf = GridSearchCV(class_pipeline_rf, param_grid_class_rf, cv=5, scoring='accuracy')\n", "grid_search_class_rf.fit(X_train, y_class_train)\n", "\n", "grid_search_class_sgd = GridSearchCV(class_pipeline_sgd, param_grid_class_sgd, cv=5, scoring='accuracy')\n", "grid_search_class_sgd.fit(X_train, y_class_train)" ], "id": "956c94b392572508", "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=5,\n", " estimator=Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " StandardScaler(),\n", " ['age',\n", " 'avg_glucose_level',\n", " 'bmi']),\n", " ('cat',\n", " OneHotEncoder(handle_unknown='ignore'),\n", " ['gender',\n", " 'ever_married',\n", " 'smoking_status'])])),\n", " ('classifier',\n", " SGDClassifier(loss='log_loss',\n", " max_iter=2000,\n", " random_state=42))]),\n", " param_grid={'classifier__alpha': [0.0001, 0.001, 0.01],\n", " 'classifier__eta0': [0.01, 0.1],\n", " 'classifier__learning_rate': ['constant', 'adaptive']},\n", " scoring='accuracy')" ], "text/html": [ "
# Load the stroke dataset.
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Fill missing BMI values with the column median.
df["bmi"] = df["bmi"].fillna(df["bmi"].median())

# Relabel the 'Other' gender category as 'Unknown'.
df['gender'] = df['gender'].replace('Other', 'Unknown')

# Split the rows by class.
stroke_0 = df[df['stroke'] == 0]
stroke_1 = df[df['stroke'] == 1]

# Oversample the stroke == 1 rows (with replacement) up to the size of the stroke == 0 group.
# NOTE(review): this happens before train_test_split, so copies of the same patient
# row can land in both the train and the test set and inflate test metrics.
stroke_1 = resample(stroke_1, replace=True, n_samples=len(stroke_0), random_state=42)

# Recombine both classes.
df = pd.concat([stroke_0, stroke_1])

# Keep a 40% random subsample; seeded so that a rerun selects the same rows
# and every downstream split and metric is reproducible.
df = df.sample(frac=0.4, random_state=42)

df.info() Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 3889 non-null int64 \n", " 1 gender 3889 non-null object \n", " 2 age 3889 non-null float64\n", " 3 hypertension 3889 non-null int64 \n", " 4 heart_disease 3889 non-null int64 \n", " 5 ever_married 3889 non-null object \n", " 6 work_type 3889 non-null object \n", " 7 Residence_type 3889 non-null object \n", " 8 avg_glucose_level 3889 non-null float64\n", " 9 bmi 3889 non-null float64\n", " 10 smoking_status 3889 non-null object \n", " 11 stroke 3889 non-null int64 \n", "dtypes: float64(3), int64(4), object(5)\n", "memory usage: 395.0+ KB\n" ] } ], "execution_count": 5 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-24T18:05:32.603683Z", "start_time": "2024-12-24T18:05:32.596873Z" } }, "cell_type": "code", "source": [
"# Target variables\n",
"X = df.drop('stroke', axis=1)\n",
"y_class = df['stroke'] # classification target\n",
"y_reg = df['avg_glucose_level'] # regression target\n",
"\n",
"# Feature columns handed to the preprocessor.\n",
"# 'avg_glucose_level' is the regression target (y_reg), so it must not also be an\n",
"# input feature: with it included the regressors simply reproduce the target.\n",
"categorical_features = ['gender', 'ever_married', 'smoking_status']\n",
"numerical_features = ['age', 'bmi']\n",
"\n",
"# Scale numeric columns and one-hot encode categorical ones; categories unseen\n",
"# during fit are ignored (handle_unknown='ignore'). Columns not listed above are\n",
"# dropped by ColumnTransformer's default remainder='drop'.\n",
"preprocessor = ColumnTransformer(\n",
"    transformers=[\n",
"        ('num', StandardScaler(), numerical_features),\n",
"        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])\n",
"\n",
"# Train/test split (80/20) shared by both targets\n",
"X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)\n",
"\n",
"def estimate_bias_variance(model, X, y, n_rounds=100, random_state=42):\n",
"    \"\"\"Estimate squared bias and prediction variance of `model` by bootstrapping.\n",
"\n",
"    Every round trains a fresh clone of `model` on a bootstrap resample of (X, y)\n",
"    and predicts on the full X. Resampling is what makes the rounds differ: refitting\n",
"    an estimator with a fixed random_state on identical data returns identical\n",
"    predictions each time, which gives a variance of ~0.\n",
"\n",
"    Args:\n",
"        model: scikit-learn estimator or pipeline; it is cloned, never refitted in place.\n",
"        X: training features, indexable by sklearn.utils.resample.\n",
"        y: training target aligned with X.\n",
"        n_rounds: number of bootstrap fits.\n",
"        random_state: seed for the bootstrap draws.\n",
"\n",
"    Returns:\n",
"        (bias, variance): mean squared gap between y and the round-averaged\n",
"        prediction, and the mean per-sample variance of predictions across rounds.\n",
"\n",
"    Raises:\n",
"        ValueError: if n_rounds is smaller than 1.\n",
"    \"\"\"\n",
"    from sklearn.base import clone  # local import keeps this cell self-contained\n",
"\n",
"    if n_rounds < 1:\n",
"        raise ValueError('n_rounds must be at least 1')\n",
"    rng = np.random.RandomState(random_state)\n",
"    y_true = np.asarray(y)\n",
"    predictions = []\n",
"    for _ in range(n_rounds):\n",
"        # Passing the shared RandomState advances it, so every round draws new rows\n",
"        X_boot, y_boot = resample(X, y, replace=True, random_state=rng)\n",
"        predictions.append(clone(model).fit(X_boot, y_boot).predict(X))\n",
"    predictions = np.array(predictions)\n",
"    bias = np.mean((y_true - predictions.mean(axis=0)) ** 2)\n",
"    variance = np.mean(predictions.var(axis=0))\n",
"    return bias, variance"
], "id": 
"45df888b49839959", "outputs": [], "execution_count": 6 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-24T18:06:01.805307Z", "start_time": "2024-12-24T18:05:38.651793Z" } }, "cell_type": "code", "source": [ "# Задача регрессии\n", "reg_pipeline_rf = Pipeline(steps=[\n", " ('preprocessor', preprocessor),\n", " ('regressor', RandomForestRegressor(random_state=42))])\n", "\n", "reg_pipeline_sgd = Pipeline(steps=[\n", " ('preprocessor', preprocessor),\n", " ('regressor', SGDRegressor(loss='squared_error', penalty='l2', random_state=42, max_iter=2000))])\n", "\n", "# Настройка гиперпараметров для регрессии\n", "param_grid_reg_rf = {\n", " 'regressor__n_estimators': [100, 200],\n", " 'regressor__max_depth': [None, 10, 20]}\n", "\n", "param_grid_reg_sgd = {\n", " 'regressor__alpha': [0.0001, 0.001, 0.01],\n", " 'regressor__learning_rate': ['constant', 'adaptive'],\n", " 'regressor__eta0': [0.01, 0.1]}\n", "\n", "# Поиск гиперпараметров\n", "grid_search_reg_rf = GridSearchCV(reg_pipeline_rf, param_grid_reg_rf, cv=5, scoring='r2')\n", "grid_search_reg_rf.fit(X_train, y_reg_train)\n", "\n", "grid_search_reg_sgd = GridSearchCV(reg_pipeline_sgd, param_grid_reg_sgd, cv=5, scoring='r2')\n", "grid_search_reg_sgd.fit(X_train, y_reg_train)" ], "id": "ee19135bbfc42564", "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=5,\n", " estimator=Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " StandardScaler(),\n", " ['age',\n", " 'avg_glucose_level',\n", " 'bmi']),\n", " ('cat',\n", " OneHotEncoder(handle_unknown='ignore'),\n", " ['gender',\n", " 'ever_married',\n", " 'smoking_status'])])),\n", " ('regressor',\n", " SGDRegressor(max_iter=2000,\n", " random_state=42))]),\n", " param_grid={'regressor__alpha': [0.0001, 0.001, 0.01],\n", " 'regressor__eta0': [0.01, 0.1],\n", " 'regressor__learning_rate': ['constant', 'adaptive']},\n", " scoring='r2')" ], "text/html": [ "
" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 7 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-24T18:06:06.657058Z", "start_time": "2024-12-24T18:06:06.621503Z" } }, "cell_type": "code", "source": [ "# Оценка моделей\n", "y_reg_pred_rf = grid_search_reg_rf.predict(X_test)\n", "y_reg_pred_sgd = grid_search_reg_sgd.predict(X_test)\n", "\n", "print(\"Regression Metrics for Random Forest:\")\n", "print(\"Mean Squared Error:\", mean_squared_error(y_reg_test, y_reg_pred_rf))\n", "print(\"R2 Score:\", r2_score(y_reg_test, y_reg_pred_rf))\n", "\n", "print(\"Regression Metrics for SGD:\")\n", "print(\"Mean Squared Error:\", mean_squared_error(y_reg_test, y_reg_pred_sgd))\n", "print(\"R2 Score:\", r2_score(y_reg_test, y_reg_pred_sgd))" ], "id": "65f8397fab8911ba", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Regression Metrics for Random Forest:\n", "Mean Squared Error: 0.020541479842551013\n", "R2 Score: 0.9999932939237861\n", "Regression Metrics for SGD:\n", "Mean Squared Error: 4.648382893338219e-05\n", "R2 Score: 0.9999999848246522\n" ] } ], "execution_count": 8 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-24T18:28:20.340519Z", "start_time": "2024-12-24T18:06:12.807767Z" } }, "cell_type": "code", "source": [ "# Оценка смещения и дисперсии\n", "bias_reg_rf, variance_reg_rf = estimate_bias_variance(grid_search_reg_rf.best_estimator_, X_train, y_reg_train)\n", "bias_reg_sgd, variance_reg_sgd = estimate_bias_variance(grid_search_reg_sgd.best_estimator_, X_train, y_reg_train)\n", "\n", "print(\"Regression Bias (Random Forest):\", bias_reg_rf)\n", "print(\"Regression Variance (Random Forest):\", variance_reg_rf)\n", "print(\"Regression Bias (SGD):\", bias_reg_sgd)\n", "print(\"Regression Variance (SGD):\", variance_reg_sgd)" ], "id": "cccd002e6275411f", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Regression Bias (Random Forest): 
0.0022602929210874885\n", "Regression Variance (Random Forest): 3.608883047891326e-24\n", "Regression Bias (SGD): 4.682701837803326e-05\n", "Regression Variance (SGD): 3.2443460449162085e-24\n" ] } ], "execution_count": 9 } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }