From beb0ee0ab82ed82534d8fc1a9b79c8fb3dc18a09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9F=D0=BE=D0=BB=D0=B8=D0=BD=D0=B0=20=D0=A7=D1=83=D0=B1?= =?UTF-8?q?=D1=8B=D0=BA=D0=B8=D0=BD=D0=B0?= Date: Sat, 9 Nov 2024 13:27:56 +0400 Subject: [PATCH] =?UTF-8?q?=D0=B5=D1=89=D0=B5=20=D0=B8=D0=B7=D0=BC=D0=B5?= =?UTF-8?q?=D0=BD=D0=B5=D0=BD=D0=B8=D1=8F=20=D0=BD=D0=B5=20=D0=B2=D1=81?= =?UTF-8?q?=D0=B5=20=D0=B7=D0=B0=D0=BB=D0=B8=D0=BB=D0=BE=D1=81=D1=8C=20?= =?UTF-8?q?=D0=BF=D0=BE=D1=87=D0=B5=D0=BC=D1=83=20=D1=82=D0=BE=3F=3F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_4/lab4.ipynb | 123 ++++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb index a5fa605..e0eabee 100644 --- a/lab_4/lab4.ipynb +++ b/lab_4/lab4.ipynb @@ -586,39 +586,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Проведем деление на выборки и создание ориентира" + "Проведем деление на выборки" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Размер обучающей выборки: (743, 16)\n", - "Размер тестовой выборки: (186, 16)\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "features = ['BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']\n", - "target = 'HeartDisease'\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=42)\n", - "\n", - "print(\"Размер обучающей выборки:\", X_train.shape)\n", - "print(\"Размер тестовой выборки:\", X_test.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -630,7 +603,60 @@ "1 796\n", "Name: count, dtype: int64\n", "Размер обучающей выборки: (1273, 49)\n", - "Размер тестовой выборки: (319, 49)\n", + "Размер тестовой выборки: (319, 49)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "features = ['BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']\n", + "target = 'HeartDisease'\n", + "\n", + "label_encoder = LabelEncoder()\n", + "data[target] = label_encoder.fit_transform(data[target])\n", + "\n", + "categorical_features = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']\n", + "numeric_features = ['BMI', 'PhysicalHealth', 'MentalHealth']\n", + "\n", + "categorical_transformer = Pipeline(steps=[\n", + " ('onehot', OneHotEncoder(handle_unknown='ignore'))\n", + "])\n", + "\n", + "numeric_transformer = Pipeline(steps=[\n", + " ('scaler', StandardScaler())\n", + "])\n", + "\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('num', numeric_transformer, numeric_features),\n", + " ('cat', categorical_transformer, categorical_features)\n", + " ])\n", + "\n", + "X = preprocessor.fit_transform(data[features])\n", + "y = data[target]\n", + "\n", + "smote = SMOTE(random_state=42)\n", + "X_resampled, y_resampled = smote.fit_resample(X, y)\n", + "\n", + "print(pd.Series(y_resampled).value_counts())\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n", + "\n", + "print(\"Размер обучающей выборки:\", X_train.shape)\n", + "print(\"Размер тестовой выборки:\", X_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ "Лучшие гиперпараметры для логистической регрессии:\n", "{'classifier__C': np.float64(0.7272998688284025), 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}\n", "Accuracy: 0.7398\n", @@ -639,14 +665,14 @@ "F1-Score: 0.7398\n", "ROC-AUC: 0.8338\n", "Лучшие гиперпараметры для случайного леса:\n", - "{'classifier__bootstrap': True, 'classifier__max_depth': np.int64(32), 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 6, 'classifier__n_estimators': 317}\n", + "{'classifier__bootstrap': True, 'classifier__max_depth': np.int64(25), 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 6, 'classifier__n_estimators': 317}\n", "Accuracy: 0.9122\n", "Precision: 0.9571\n", "Recall: 0.8590\n", "F1-Score: 0.9054\n", "ROC-AUC: 0.9773\n", "Лучшие гиперпараметры для градиентного бустинга:\n", - "{'classifier__learning_rate': np.float64(0.17269984907963387), 'classifier__max_depth': np.int64(96), 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 8, 'classifier__n_estimators': 294, 'classifier__subsample': np.float64(0.8288064461501716)}\n", + "{'classifier__learning_rate': np.float64(0.17269984907963387), 'classifier__max_depth': np.int64(52), 'classifier__min_samples_leaf': 8, 'classifier__min_samples_split': 8, 'classifier__n_estimators': 294, 'classifier__subsample': np.float64(0.8288064461501716)}\n", "Accuracy: 0.9185\n", "Precision: 0.9577\n", "Recall: 0.8718\n", @@ -692,39 +718,6 @@ "from scipy.stats import uniform, randint\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", - "features = ['BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']\n", - "target = 'HeartDisease'\n", - "\n", - "label_encoder = LabelEncoder()\n", - "data[target] = label_encoder.fit_transform(data[target])\n", - "\n", - "categorical_features = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']\n", - "numeric_features = ['BMI', 'PhysicalHealth', 'MentalHealth']\n", - "\n", - "categorical_transformer = Pipeline(steps=[\n", - " ('onehot', OneHotEncoder(handle_unknown='ignore'))\n", - "])\n", - "\n", - "numeric_transformer = Pipeline(steps=[\n", - " ('scaler', StandardScaler())\n", - "])\n", - "\n", - "preprocessor = ColumnTransformer(\n", - " transformers=[\n", - " ('num', numeric_transformer, numeric_features),\n", - " ('cat', categorical_transformer, categorical_features)\n", - " ])\n", - "\n", - "X = preprocessor.fit_transform(data[features])\n", - "y = data[target]\n", - "\n", - "smote = SMOTE(random_state=42)\n", - "X_resampled, y_resampled = smote.fit_resample(X, y)\n", - "\n", - "print(pd.Series(y_resampled).value_counts())\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n", - "\n", "def evaluate_model(model, X_test, y_test):\n", " y_pred = model.predict(X_test)\n", " y_pred_proba = model.predict_proba(X_test)[:, 1]\n",