1102 lines
364 KiB
Plaintext
1102 lines
364 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Лабораторная работа 3."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 40,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',\n",
|
|||
|
" 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',\n",
|
|||
|
" 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',\n",
|
|||
|
" 'Asthma', 'KidneyDisease', 'SkinCancer'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"df = pd.read_csv(\"./datasets/var2/2020/heart_2020_cleaned.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Бизнес-цели:\n",
|
|||
|
"- Разработка персонализированных программ профилактики сердечно-сосудистых заболеваний. Цель технического проекта: создание модели машинного обучения, которая будет прогнозировать риск сердечного приступа для каждого пациента на основе его индивидуальных факторов риска, и разработка онлайн-платформы или приложения для предоставления персонализированных рекомендаций по профилактике.\n",
|
|||
|
"- Улучшение качества медицинской Цель технического проекта: использование данных для выявления групп населения с наибольшим риском сердечного приступа и разработки целевых программ профилактики и раннего выявления заболеваний.\n",
|
|||
|
"\n",
|
|||
|
"#### Выполним разбиение на 3 выборки: обучающую, контрольную и тестовую"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 41,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 156699\n",
|
|||
|
"Размер контрольной выборки: 67157\n",
|
|||
|
"Размер тестовой выборки: 95939\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"./datasets/var2/2020/heart_2020_cleaned.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 42,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в HeartDisease:\n",
|
|||
|
"HeartDisease\n",
|
|||
|
"No 292422\n",
|
|||
|
"Yes 27373\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAHHCAYAAACWQK1nAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABJlUlEQVR4nO3deVgVdf//8dcBZVE8oCIgiUuaO2niEpm4RKJS3ablkne5ZnWjpZSad4b7TeXtlku2fBMrLbVSSxM13JU0KdxSM8NbS0FcACUFhfn90cX8PIKKOoro83Fd57o8M+/5zHuGwzkvZ+YMNsMwDAEAAOCGOBV1AwAAAHcCQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAVxETEyObzaaDBw8WdSu4jRGqUCzkvaHlPdzc3FSzZk0NGDBAKSkpRd0ecFcaNWqUbDabjh8/XuD8qlWr6rHHHrvFXf1/M2fOVExMTL7pa9eudXg/cXV1la+vr1q1aqX//Oc/Sk1NvfXN4o5QoqgbAK7FmDFjVK1aNZ07d04bN27Ue++9p++++067du1SqVKliro9ALeRmTNnytvbW7169Spw/ssvv6wmTZooJydHqamp2rx5s0aOHKlJkyZpwYIFatOmjVn77LPPqlu3bnJ1db1F3aM4IlShWGnfvr0aN24sSerXr5/Kly+vSZMmacmSJerevXsRdwfgdvDXX38V6j9ZLVq00FNPPeUwbfv27Wrbtq06d+6sX375RRUrVpQkOTs7y9nZ+ab0izsHp/9QrOX9TzIpKUmSdPLkSb322msKDAyUh4eH7Ha72rdvr+3bt+db9ty5cxo1apRq1qwpNzc3VaxYUZ06ddKBAwckSQcPHnQ4RXDpo1WrVuZYeacT5s+fr3//+9/y8/NT6dKl9cQTT+jw4cP51r1lyxa1a9dOnp6eKlWqlFq2bKlNmzYVuI2tWrUqcP2jRo3KV/vZZ58pKChI7u7uKleunLp161bg+q+0bRfLzc3VlClTVK9ePbm5ucnX11cvvPCCTp065VB3udM8AwYMyDdmQb1PmDAh3z6VpKysLI0cOVI1atSQq6urAgICNHToUGVlZRW4ry7WqlWrfOONHz9eTk5Omjdv3nXtj//+97966KGHVL58ebm7uysoKEhffvllgev/7LPP1LRpU5UqVUply5ZVSEiIVq5c6VCzfPlytWzZUmXKlJHdbleTJk3y9bZw4ULzZ+rt7a1//vOf+vPPPx1qevXq5dBz2bJl1apVK23YsOGq++lGlr0ehX1NLVmyROHh4fL395erq6uqV6+usWPHKicnx6GuVatWql+/vhISEhQSEqJSpUrp3//+t6pWrardu3dr3bp1Bf7OXk6DBg00ZcoUpaWlafr06eb0gq6p2rZtm8LCwuTt7S13d3dVq1ZNffr0uanbu3//fnXu3Fl+fn5yc3NTpUqV1K1bN6WnpzvUFfa9ANbiSBWKtbwAVL58eUnS77//rsWLF+vpp59WtWrVlJKSovfff18tW7bUL7/8In9/f0lSTk6OHnvsMcXFxalbt2565ZVXdPr0aa1atUq7du1S9erVzXV0795dHTp0cFjv8OHDC+xn/PjxstlsGjZsmI4dO6YpU6YoNDRUiYmJcnd3lyStXr1a7du3V1BQkEaOHCknJyfNnj1bbdq00YYNG9S0adN841aqVEnR0dGSpDNnzuill14qcN1vvvmmunTpon79+ik1NVXTpk1TSEiIfv75Z3l5eeVbpn///mrRooUk6euvv9aiRYsc5r/wwguKiYlR79699fLLLyspKUnTp0/Xzz//rE2bNqlkyZIF7odrkZaWZm7bxXJzc/XEE09o48aN6t+/v+rUqaOdO3dq8uTJ+vXXX7V48eJrWs/s2bM1YsQITZw4Uc8880yBNVfbH1OnTtUTTzyhHj16KDs7W1988YWefvppLV26VOHh4Wbd6NGjNWrUKD300EMaM2aMXFxctGXLFq1evVpt27aV9PeHdJ8+fVSvXj0NHz5cXl5e+vnnnxUbG2v2l7fvmzRpoujoaKWkpGjq1KnatGlTvp+pt7e3Jk+eLEn6448/NHXqVHXo0EGHDx8u8Gd/sRtZVvr7PzMFyc3NzTetsK+pmJgYeXh4KDIyUh4eHlq9erWioqKUkZGhCRMmOIx54sQJtW/fXt26ddM///lP8/qogQMHysPDQ2+88YYkydfX96rbIklPPfWU+vbtq5UrV2r8+PEF1hw7dkxt27ZVhQoV9Prrr8vLy0sHDx7U119/fdO2Nzs7W2FhYcrKytLAgQPl5+enP//8U0uXLlVaWpo8PT0lXd97ASxiAMXA7NmzDUnG999/b6SmphqHDx82vvjiC6N8+fKGu7u78ccffxiGYRjnzp0zcnJyHJZNSkoyXF1djTFjxpjTPv74Y0OSMWnSpHzrys3NNZeTZEyYMCFfTb169YyWLVuaz9esWWNIMu655x4jIyPDnL5gwQJDkjF16lRz7Pvuu88ICwsz12MYhvHXX38Z1apVMx599NF863rooYeM+vXrm89TU1MNScbIkSPNaQcPHjScnZ2N8ePHOyy7c+dOo0SJEvmm79+/35BkzJkzx5w2cuRI4+K3hA0bNhiSjLlz5zosGxsbm296lSpVjPDw8Hy9R0REGJe+zVza+9ChQw0fHx8jKCjIYZ9++umnhpOTk7FhwwaH5WfNmmVIMjZt2pRvfRdr2bKlOd6yZcuMEiVKGK+++mqBtYXZH4bx98/pYtnZ2Ub9+vWNNm3aOIzl5ORkPPnkk/lei3k/87S0NKNMmTJGs2bNjLNnzxZYk52dbfj4+Bj169d3qFm6dKkhyYiKijKn9ezZ06hSpYrDOB988IEhydi6dWuB22zFsnn76EqPi18X1/KaunRfG4ZhvPDCC0apUqWMc+fOmdNatmxpSDJmzZqVr/7S39M8eb+vCxcuvOy2NWjQwChbtqz5PO89KCkpyTAMw1i0aJEhyfjxxx8vO4bV2/vzzz9fte9rfS+AtTj9h2IlNDRUFSpUUEBAgLp16yYPDw8tWrRI99xzjyTJ1dVVTk5/v6xzcnJ04sQJeXh4qFatWvrpp5/Mcb766it5e3tr4MCB+dZx6Smfa/Hcc8+pTJky5vOnnnpKFStW1HfffSdJSkxM1P79+/XMM8/oxIkTOn78uI4fP67MzEw98sgjWr9+fb7/3Z87d05ubm5XXO/XX3+t3NxcdenSxRzz+PHj8vPz03333ac1a9Y41GdnZ0vSFS+6XbhwoTw9PfXoo486jBkUFCQPD498Y54/f96h7vjx4zp37twV+/7zzz81bdo0vfnmm/Lw8Mi3/jp16qh27doOY+ad8r10/ZezdetWdenSRZ07d853hCNPYfaHJPNooySdOnVK6enpatGihcNra/HixcrNzVVUVJT5WsyT99patWqVTp8+rddffz3fzzavZtu2bTp27Jj+9a9/OdSEh4erdu3aWrZsmcNyubm55j5KTEzUJ598oooVK6pOnTpX3KYbXVb6+/dp1apV+R6XHhm6ltfUxfv69OnTOn78uFq0aKG//vpLe/fudRjX1dVVvXv3LlSvheXh4aHTp09fdn7e0Z6lS5fq/PnzBdZYvb15R6JWrFihv/76q8B1Xut7AazF6T8UKzNmzFDNmjVVokQJ+fr6qlatWg4fXLm5uZo6dapmzpyppKQkh+sR8k4RSn+fNqxVq5ZKlLD2V+C+++5zeG6z2VSjRg3zOoz9+/dLknr27HnZMdLT01W2bFnz+fHjx/ONe6n9+/fLMIzL1l16mi4tLU2S8gWZS8dMT0+Xj49PgfOPHTvm8HzlypWqUKHCFfu81MiRI+Xv768XXngh37VJ+/fv1549ey475qXrL8iff/6p8PBwZWZm6sSJE5cNzIXZH9LfH6Djxo1TYmKiw3VdF4974MABOTk5qW7dupcdJ++0df369S9b87///U+SVKtWrXzzateurY0bNzpMO3z4sMO+qlixor766qurbtONLitJISEh8vb2zjf90sB4La+p3bt
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в Обучающей выборке:\n",
|
|||
|
"HeartDisease\n",
|
|||
|
"No 143331\n",
|
|||
|
"Yes 13368\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAHHCAYAAACWQK1nAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABUlUlEQVR4nO3deVQV9f8/8Oe9IIvABTeWW6jkhgtqoCGiuJGoaF/SVJRcSazANFypxLVI/Jj7XomVlpIf0VxQwgVTQkFRMUXyg1t6EUVAMBZhfn947vwY70URRwF9Ps6553hnXvOe18y9F57OzB0UgiAIICIiIqJnoqzqBoiIiIheBgxVRERERDJgqCIiIiKSAUMVERERkQwYqoiIiIhkwFBFREREJAOGKiIiIiIZMFQRERERyYChiojoFXPv3j1cvnwZ+fn5Vd0KySw7Oxt///03Hjx4UNWtvJIYqoiIXnKCIGDdunXo1KkTateuDZVKBQcHB/z0009V3VqNcP36dURERIjPL1++jE2bNlVdQ2UUFxcjPDwc7dq1g7GxMerUqYNmzZohNja2qlt7JTFUkV4RERFQKBTiw8TEBM2bN0dQUBAyMjKquj2iV9rRo0fx7rvvwsbGBsbGxmjcuDHGjx+Pq1ev6q0fPnw4PvzwQ7Rs2RI//vgjYmJi8Pvvv2PgwIEvuPOaSaFQIDAwEPv27cPly5cxbdo0HDlypKrbQmFhITw9PTFz5kx0794dkZGRiImJwYEDB+Dm5lbV7b2SDKu6Aare5s6dCwcHBxQUFOCPP/7A6tWrsWfPHqSkpKB27dpV3R7RK2f58uWYOHEi3njjDUyYMAF2dnY4f/48vv32W2zZsgV79uxB586dxfoffvgBW7ZswU8//YThw4dXYec112uvvYZx48ahT58+AAA7OzscOnSoapsCsGDBAiQkJGDfvn3o3r17VbdDABT8g8qkT0REBMaMGYMTJ06gQ4cO4vTJkyfjm2++webNmzFs2LAq7JDo1XP06FF4eHjA3d0d0dHRkv/YXLp0Ce7u7lAqlTh37hzq1KkDAHByckLbtm2rzemqmuzSpUu4ffs22rRpAzMzsyrt5cGDB7C2tsZHH32EL7/8skp7of+Pp//oqfTs2RMAkJ6eDgDIysrClClT4OTkBHNzc6hUKvTt2xenT5/WWbagoACzZ89G8+bNYWJiAjs7OwwcOBCXLl0C8PA6hbKnHB99lP2f2KFDh6BQKLBlyxZ89tlnsLW1hZmZGd555x1cu3ZNZ90JCQno06cPLC0tUbt2bXTr1g1Hjx7Vu43du3fXu/7Zs2fr1P70009wcXGBqakp6tatC19fX73rf9y2lVVaWoolS5agdevWMDExgY2NDcaPH4+7d+9K6ho3boz+/fvrrCcoKEhnTH29L1y4UGefAg9PJ8yaNQtNmzaFsbEx7O3tMW3aNBQWFurdV2V1795dZ7wvv/wSSqUSmzdvrtT++M9//oPOnTujXr16MDU1hYuLC3799Ve96//pp5/w1ltvoXbt2qhTpw48PDywf/9+Sc3evXvRrVs3WFhYQKVSoWPHjjq9RUZGiq9p/fr18f777+Off/6R1IwePVrSc506ddC9e/cKnRJ6lmXnzZsHhUKBjRs36hwpbtKkCcLDw3Hz5k2sXbsWAJCfn4+UlBTY29vD29sbKpUKZmZmOuv73//+B4VCgcWLF+us89ixY1AoFPj5558B6H+dta9n2euOzpw5g9GjR+ONN96AiYkJbG1tMXbsWNy5c0eyrPZSg8uXL4vT9u3bh86dO6N27dqwtLRE//79kZKSIllu9uzZUCgUuH37tjgtMTFRpw8AaNOmjd4jOXv37kXXrl1hZmYGCwsLeHt749y5c5Ka0aNHo3HjxuI+dnV1RVZWFkxNTXX61qeir3d5n2kt7c887RGy1NRU3L17FxYWFujWrdtj9xUAnDp1Cn379oVKpYK5uTl69eqFP//8U1KjfS3i4uIwfvx41KtXDyqVCiNHjtT7M2j06NGSaQEBATAxMdE5ileR/fyy4Ok/eiraAFSvXj0AD38YR0VFYfDgwXBwcEBGRgbWrl2Lbt264a+//oJarQYAlJSUoH///oiNjYWvry8mTpyIe/fuISYmBikpKWjSpIm4jmHDhqFfv36S9YaEhOjt58svv4RCocD06dNx69YtLFmyBJ6enkhOToapqSkA4MCBA+jbty9cXFwwa9YsKJVKbNiwAT179sSRI0fw1ltv6Yz7+uuvIywsDACQl5eHjz76SO+6Z86ciSFDhuCDDz5AZmYmli9fDg8PD5w6dQpWVlY6ywQEBKBr164AgP/+97/Yvn27ZP748ePFo4SffPIJ0tPTsWLFCpw6dQpHjx5FrVq19O6Hp5GdnS1uW1mlpaV455138McffyAgIAAtW7bE2bNnsXjxYly8eBFRUVFPtZ4NGzbgiy++wKJFi8o97fSk/bF06VK888478PPzQ1FREX755RcMHjwYu3btgre3t1g3Z84czJ49G507d8bcuXNhZGSEhIQEHDhwAL179wbw8BfG2LFj0bp1a4SEhMDKygqnTp1CdHS02J9233fs2BFhYWHIyMjA0qVLcfToUZ3XtH79+mIIuX79OpYuXYp+/frh2rVrel/7siqz7P379xEbG4uuXbvCwcFBb83QoUMREBCAXbt2YcaMGWKAWbBgAWxtbTF16lSYmJhg/fr18PT0RExMDDw8PPDGG2/A3d0dmzZtwqeffioZc9OmTbCwsMD//d//PXabHhUTE4P//e9/GDNmDGxtbXHu3DmsW7cO586dw59//qkToLWOHDmCfv36oVGjRpg1axaKi4uxatUquLu748SJE2jevPlT9VGeH3/8EaNGjYKXlxcWLFiA+/fvY/Xq1ejSpQtOnTolBil9QkNDUVBQUOF1Pct7pTza1zYkJATNmjXDnDlzUFBQgJUrV+rsq3PnzqFr165QqVSYNm0aatWqhbVr16J79+44fPgwXF1dJWMHBQXBysoKs2fPRmpqKlavXo0rV66IwU6fWbNm4bvvvsOWLVskAfZZ9nONJBDpsWHDBgGA8PvvvwuZmZnCtWvXhF9++UWoV6+eYGpqKly/fl0QBEEoKCgQSkpKJMump6cLxsbGwty5c8Vp33//vQBA+Oabb3TWVVpaKi4HQFi4cKFOTevWrYVu3bqJzw8ePCgAEF577TUhNzdXnL5161YBgLB06VJx7GbNmgleXl7iegRBEO7fvy84ODgIb7/9ts66OnfuLLRp00Z8npmZKQAQZs2aJU67fPmyYGBgIHz55ZeSZc+ePSsYGhrqTE9LSxMACBs3bhSnzZo1Syj7ETxy5IgAQNi0aZNk2ejoaJ3pjRo1Ery9vXV6DwwMFB79WD/a+7Rp0wRra2vBxcVFsk9//PFHQalUCkeOHJEsv2bNGgGAcPToUZ31ldWtWzdxvN27dwuGhobC5MmT9dZWZH8IwsPXqayioiKhTZs2Qs+ePSVjKZVK4d1339V5L2pf8+zsbMHCwkJwdXUV/v33X701RUVFgrW1tdCmTRtJza5duwQAQmhoqDht1KhRQqNGjSTjrFu3TgAgHD9+XO82P+uyycnJAgBh4sSJjx2/bdu2Qt26dQVB+P+fKSMjI+HixYtiTWZmplCvXj3BxcVFnLZ27VoBgHD+/HlxWlFRkVC/fn1h1KhR4rQePXoIHh4eknVq17NhwwZx2qOvnSAIws8//ywAEOLi4sRp2p816enpgiAIgouLi2BpaSloNBqx5uLFi0KtWrWEQYMGidO075fMzExx2okTJ3T6EATdnx/37t0TrKyshHHjxknqNBqNYGlpKZn+6OuVkpIiKJVKoW/fvpK+y1PR17u8z7SW9mfewYMHJc/r168v3L59W6zTt698fHwEIyMj4dKlS+K0GzduCBYWFpLXUvtauLi4CEVFReL08PBwAYCwY8cOSb/a94X2vbN8+XJJz0+zn18WPP1Hj+Xp6YkGDRrA3t4evr6+MDc3x/bt2/Haa68BAIyNjaFUPnwblZSU4M6dOzA3N0eLFi1w8uRJcZxt27ahfv36mDB
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в Контрольной выборке:\n",
|
|||
|
"HeartDisease\n",
|
|||
|
"No 61442\n",
|
|||
|
"Yes 5715\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHHCAYAAACiOWx7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABPZ0lEQVR4nO3de3zP9f//8fs2dsi8N8fNMqbmfMyIJRTLMPVR5JBPIVI+o1g5rMOcKqRiSDqaPqWQosicT7EcVnMKyWei2MxhG4uN7fX7w/f9+nl7j16Weo9u18vlfbnY6/V4PV+P12t7v933Os3NMAxDAAAAuCp3VzcAAABwIyA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABaUcHUDAACg6PLy8nTy5EkVFBQoKCjI1e3c1DjSBABAIT7++GMdPHjQ/DohIUG//fab6xq6xLZt2/TII4+ofPny8vLyUqVKldSlSxdXt3XTIzTd5BISEuTm5ma+vL29VaNGDQ0aNEjp6emubg/4Rxo9erTc3Nx0/Phxp3kvvPCC3NzcNHjwYBd0hktt2LBBw4cP18GDB7Vs2TJFR0fL3d31/20uWrRId999t3788Ue98sorWrFihVasWKF33nnH1a3d9Dg99w8xduxYVatWTefOndO3336rt99+W99884127dqlW265xdXtAZA0depUvfrqq+revbvi4+Nd3c4/3tChQ3XPPfeoWrVqkqSYmBhVqlTJpT2dPHlS/fv3V2RkpObPny9PT0+X9vNPQ2j6h+jQoYOaNGkiSerfv7/KlSunN998U4sWLVLPnj1d3B2AOXPmaMiQIYqIiNBHH31ULI5o/NPVqlVLBw4c0K5du1S+fHndfvvtrm5Js2bN0rlz55SQkEBgcgHelf9Qbdq0kSSlpqZKuvjby3PPPaf69evL19dXNptNHTp00Pbt252WPXfunEaPHq0aNWrI29tblSpV0kMPPaQDBw5Ikg4ePOhwSvDy1z333GOOtXbtWrm5uWnu3Ll6/vnnFRgYqFKlSumBBx7Q4cOHnda9efNmtW/fXn5+frrlllvUunVrbdy4sdBtvOeeewpd/+jRo51qP/74Y4WFhcnHx0dly5ZVjx49Cl3/1bbtUgUFBZoyZYrq1q0rb29vBQQE6Mknn9SpU6cc6kJCQtSpUyen9QwaNMhpzMJ6nzRpktM+laTc3FyNGjVKoaGh8vLyUnBwsIYPH67c3NxC99Wl7rnnHqfxXnnlFbm7u2vOnDlF2h+vv/667rrrLpUrV04+Pj4KCwvT559/Xuj6P/74Y91555265ZZbVKZMGbVq1UrLly93qFm6dKlat26t0qVLy2azqWnTpk69zZ8/3/yeli9fXv/+97+drkfp06ePQ89lypTRPffcow0bNvzhfvozy15u2bJl6tOnjxo3bqwvvvii0P8MZ8yYobp168rLy0tBQUGKjo5WZmamQ80999yjevXqOS37+uuvy83Nzbw+JyQk5Krv0ZCQEEn///v7+uuva/Lkyapatap8fHzUunVr7dq1y2k9q1evVsuWLVWqVCn5+/vrX//6l/bs2VPoNl+ph7Vr1zrUFPb+uNSlPV6uXr16Tj/Lx44dU79+/RQQECBvb281bNhQs2fPLnTMhIQElSpVSs2aNdPtt9+u6Ohoubm5qU+fPpZ6sr9KliypkJAQDRs2THl5eWad/fKJbdu2XXGsy9+P3333nRo1aqRXX31VwcHB8vLyUvXq1TVhwgQVFBQ4LHvhwgWNGzdOt99+u7y8vBQSEqLnn3/e6XPAvp+XL1+uRo0aydvbW3Xq1NEXX3zhUGfv99LrvHbv3q0yZcqoU6dOunDhgjk9MzNTQ4YMMXsMDQ3VxIkTnXq80XCk6R/KHnDKlSsnSfrf//6nhQsX6uGHH1a1atWUnp6ud955R61bt9aPP/5o3pGRn5+vTp06adWqVerRo4eeeeYZnT59WitWrNCuXbscfhPr2bOnOnbs6LDe2NjYQvt55ZVX5ObmphEjRujYsWOaMmWKIiIilJKSIh8fH0kXP5A7dOigsLAwjRo1Su7u7po1a5batGmjDRs26M4773Qat3Llyho/frwk6cyZMxo4cGCh637ppZfUrVs39e/fXxkZGZo2bZpatWqlH374Qf7+/k7LDBgwQC1btpQkffHFF/ryyy8d5j/55JNKSEhQ37599fTTTys1NVXTp0/XDz/8oI0bN6pkyZKF7odrkZmZaW7bpQoKCvTAAw/o22+/1YABA1S7dm3t3LlTkydP1k8//aSFCxde03pmzZqlF198UW+88YYeeeSRQmv+aH/Ex8frgQceUK9evZSXl6fPPvtMDz/8sBYvXqyoqCizbsyYMRo9erTuuusujR07Vp6entq8ebNWr16tdu3aSbr4wf3444+rbt26io2Nlb+/v3744QclJiaa/dn3fdOmTTV+/Hilp6crPj5eGzdudPqeli9fXpMnT5Yk/frrr4qPj1fHjh11+PDhQr/3l/ozy9pt2bJFXbp0UUhIiJYuXarSpUs71YwePVpjxoxRRESEBg4cqH379untt9/W1q1bi/TzNGXKFJ05c0aStGfPHr366qt6/vnnVbt2bUmSr6+vQ/1HH32k06dPKzo6WufOnVN8fLzatGmjnTt3KiAgQJK0cuVKdejQQbfddptGjx6ts2fPatq0aWrRooW+//57M4hdqmXLlhowYIBDH3+ls2fP6p577tHPP/+sQYMGqVq1apo/f7769OmjzMxMPfPMM1dc9ueff9Z77713Teuzvy9yc3O1bNkyvf766/L29ta4ceOKvA0nTpzQt99+q2+//VaPP/64wsLCtGrVKsXGxurgwYOaOXOmWdu/f3/Nnj1bXbt21bPPPqvNmzdr/Pjx2rNnj9N7dP/+/erevbueeuop9e7dW7NmzdLDDz+sxMRE3XfffYX2cvjwYbVv3161atXSvHnzVKLExUjx+++/q3Xr1vrtt9/05JNPqkqVKtq0aZNiY2N19OhRTZkypcjb73IGbmqzZs0yJBkrV640MjIyjMOHDxufffaZUa5cOcPHx8f49ddfDcMwjHPnzhn5+fkOy6amphpeXl7G2LFjzWkffvihIcl48803ndZVUFBgLifJmDRpklNN3bp1jdatW5tfr1mzxpBk3HrrrUZ2drY5fd68eYYkIz4+3hy7evXqRmRkpLkewzCM33//3ahWrZpx3333Oa3rrrvuMurVq2d+nZGRYUgyRo0aZU47ePCg4eHhYbzyyisOy+7cudMoUaKE0/T9+/cbkozZs2eb00aNGmVc+lbasGGDIcn45JNPHJZNTEx0ml61alUjKirKqffo6Gjj8rfn5b0PHz7cqFixohEWFuawT//73/8a7u7uxoYNGxyWnzlzpiHJ2Lhxo9P6LtW6dWtzvCVLlhglSpQwnn322UJrrewPw7j4fbpUXl6eUa9ePaNNmzYOY7m7uxsPPvig08+i/XuemZlplC5d2mjWrJlx9uzZQmvy8vKMihUrGvXq1XOoWbx4sSHJiIuLM6f17t3bqFq1qsM47777riHJ2LJlS6HbfD2Wte+jDRs2GOXKlTMkGQMGDCi09tixY4anp6fRrl07h/0yffp0Q5Lx4YcfmtNat25t1K1b12mMSZMmGZKM1NRUp3n29+CaNWuc5tnfy5d+VhiGYWzevNmQZAwdOtSc1qhRI6NixYrGiRMnzGnbt2833N3djccee8xp7FtvvdXo27fvVfu40vujsB6tfN5MmTLFkGR8/PHH5rS8vDwjPDzc8PX1NT+D7GPOmjXLrOvWrZtRr149Izg42Ojdu7elni5d3jAMIygoyOjYsaP5tf3zeevWrVcc69L3o/1rScbo0aMd6vr06WNIMnbu3GkYhmGkpKQYkoz+/fs71D333HOGJGP16tXmtKpVqxqSjAULFpjTsrKyjEqVKhl33HGHU7+pqanGyZMnjTp16hg1a9Y0jh8/7rCOcePGGaVKlTJ++uknh+kjR440PDw8jEOHDl1xe4s7Ts/9Q0RERKhChQoKDg5Wjx495Ovrqy+
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в Тестовой выборке:\n",
|
|||
|
"HeartDisease\n",
|
|||
|
"No 87649\n",
|
|||
|
"Yes 8290\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHHCAYAAACiOWx7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABGQ0lEQVR4nO3dd3gUdfv+/XOTkCIhhJYmAXLTS25KKAYEBCKhyRdFKSLSBNQAYpCm0kGaSBURC6CCIiBFkNCbiJQgTQGRO0iRhFCSQCiBZJ4//GUelg04BGSDvl/HscfBzlzzmWtms5uTmdmJzTAMQwAAALgjF2c3AAAA8DAgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWuDm7AQAA/q2uXr2q8+fPy83NTX5+fs5uB3+BI00AgH+cadOmKSkpyXw+adIkpaamOq+hm6xdu1bNmzeXr6+vvLy89Oijj+q1115zdluwgNAES2bPni2bzWY+PD09VapUKfXo0UMJCQnObg/4Vzl27Jjd+/FOj2PHjjm7Xaf49ttvNXToUJ04cUJz587VoEGD5OXl5ey2NH36dEVGRio5OVmTJ0/WmjVrtGbNGg0fPtzZrcECTs/hrgwfPlwhISG6evWqvv/+e33wwQf67rvvdODAAT3yyCPObg/4VyhUqJA+//xzu2kTJkzQyZMnNXHiRIfaf6M333xTzZs31+TJk+Xi4qIJEybIxcW5xwmOHDmi6OhodevWTdOnT5fNZnNqP7h7Nv5gL6yYPXu2OnXqpJ07d6pq1arm9D59+ui9997TvHnz1LZtWyd2CPy7NWvWTAcOHPjXHlnKSlJSkg4ePKjg4GAVLlzY2e2oZ8+e+vbbb3XkyBHlypXL2e0gGzg9h3tSv359SVJcXJwk6fz583rjjTcUGhoqb29v+fj4qHHjxtq7d6/DslevXtXQoUNVqlQpeXp6KjAwUM8884yOHj0q6a9PQTzxxBPmWBs3bpTNZtP8+fP15ptvKiAgQLlz51bz5s114sQJh3Vv375djRo1Ut68efXII4+obt262rp1a5bb+MQTT2S5/qFDhzrUfvHFFwoLC5OXl5fy58+vNm3aZLn+O23bzTIyMjRp0iSVL19enp6e8vf3V/fu3XXhwgW7umLFiqlZs2YO6+nRo4fDmFn1Pn78eId9KknXrl3TkCFDVKJECXl4eCg4OFj9+vXTtWvXstxXN3viiSccxhs1apRcXFw0b968bO2Pd999VzVr1lSBAgXk5eWlsLAwLVy4MMv1f/HFF6pevboeeeQR5cuXT3Xq1NHq1avtalauXKm6desqT5488vHxUbVq1Rx6W7BggfmaFixYUC+88IJOnTplV9OxY0e7nvPly6cnnnhCW7Zs+cv9dC/L3o2kpCT17t1bwcHB8vDwUIkSJTR27FhlZGTY1WVkZGjy5MkKDQ2Vp6enChUqpEaNGmnXrl2S9JenA29+zc+cOaMuXbrI399fnp6eqlixoubMmWO3vltf+1y5cqlYsWLq27ev0tLS7Gr/97//6bnnnlP+/Pn1yCOP6LHHHtOKFSvsajI/CzZu3ChfX1+Fh4ercOHCatq06W3ft1ktn/nw8PBQqVKlNHr0aN18jGHo0KGy2Ww6e/bsbccqVqyYOnbsaD7/8ccfFRYWpldffVX+/v7y8PBQhQoV9NFHHzksm5qaqj59+pivV+nSpfXuu+/q1uMcNptNPXr00Ny5c1W6dGl5enoqLCxMmzdvtqvL7PdmGzZskIeHh15++WW76adOnVLnzp3NHsuXL69PP/30jvvt34LTc7gnmQGnQIECkv78UFuyZImee+45hYSEKCEhQR9++KHq1q2rX375RUFBQZKk9PR0NWvWTOvWrVObNm302muv6eLFi1qzZo0OHDig4sWLm+to27atmjRpYrfegQMHZtnPqFGjZLPZ1L9/f505c0aTJk1SRESE9uzZY17PsH79ejVu3FhhYWEaMmSIXFxcNGvWLNWvX19btmxR9erVHcYtXLiwRo8eLUm6dOmSXnnllSzXPWjQILVq1UovvfSSEhMTNXXqVNWpU0c//fSTfH19HZbp1q2bateuLUn65ptvtHjxYrv53bt3N4/y9erVS3FxcZo2bZp++uknbd269b78bzUpKcnctptlZGSoefPm+v7779WtWzeVLVtW+/fv18SJE/Xrr79qyZIld7WeWbNm6e2339aECRP0/PPPZ1nzV/tj8uTJat68udq1a6e0tDR99dVXeu6557R8+XI1bdrUrBs2bJiGDh2qmjVravjw4XJ3d9f27du1fv16NWzYUNKfR087d+6s8uXLa+DAgfL19dVPP/2kmJgYs7/MfV+tWjWNHj1aCQkJmjx5srZu3erwmhYsWNA8NXby5ElNnjxZTZo00YkTJ7J87W92L8tacfnyZdWtW1enTp1S9+7dVaRIEf3www8aOHCgTp8+rUmTJpm1Xbp00ezZs9W4cWO99NJLunHjhrZs2aIff/xRVatWtTstuGXLFs2cOVMTJ05UwYIFJUn+/v6SpCtXruiJJ57Qb7/9ph49eigkJEQLFixQx44dlZSU5HDhc+Zrf+3aNa1atUrvvvuuPD09NWLECElSQkKCatasqcuXL6tXr14qUKCA5syZo+bNm2vhwoV6+umnb7v9mzdv1nfffXdX++zNN99U2bJldeXKFfM/Y35+furSpctdjXOzc+fOadeuXXJzc1NUVJSKFy+uJUuWqFu3bjp37pwGDBggSTIMQ82bN9eGDRvUpUsXVapUSatWrVLfvn116tQph1OwmzZt0vz589WrVy95eHho+vTpatSokXbs2KEKFSpk2cvevXvVokULNWnSRO+//745PSEhQY899pgZxgoVKqSVK1eqS5cuSklJUe/evbO9/f8IBmDBrFmzDEnG2rVrjcTEROPEiRPGV199ZRQoUMDw8vIyTp48aRiGYVy9etVIT0+3WzYuLs7w8PAwhg8fbk779NNPDUnGe++957CujIwMczlJxvjx4x1qypcvb9StW9d8vmHDBkOS8eijjxopKSnm9K+//tqQZEyePNkcu2TJkkZkZKS5HsMwjMuXLxshISHGk08+6bCumjVrGhUqVDCfJyYmGpKMIUOGmNOOHTtmuLq6GqNGjbJbdv/+/Yabm5vD9CNHjhiSjDlz5pjThgwZYtz8ltyyZYshyZg7d67dsjExMQ7TixYtajRt2tSh96ioKOPWt/mtvffr18/w8/MzwsLC7Pbp559/bri4uBhbtmyxW37GjBmGJGPr1q0O67tZ3bp1zfFWrFhhuLm5GX369Mmy1sr+MIw/X6ebpaWlGRUqVDDq169vN5aLi4vx9NNPO/wsZr7mSUlJRp48eYwaNWoYV65cybImLS3N8PPzMypUqGBXs3z5ckOSMXjwYHNahw4djKJFi9qNM3PmTEOSsWPHjiy3+X4se7OmTZs6jJNpxIgRRu7cuY1ff/3VbvqAAQMMV1dX4/jx44ZhGMb69esNSUavXr0cxrj5/ZIp83MhLi7OYd6kSZMMScYXX3xhTktLSzPCw8MNb29v832a+T6fNWuW3fJBQUFGkyZNzOe9e/c2JNn9PF68eNEICQkxihUrZr7WmZ8FGzZsMOtq1KhhNG7c2OFnPytZLX/16lXDxcXFePXVV81pmT+fiYmJtx2raNGiRocOHeyeSzJmz55tTrtx44bRoEEDw8PDwzh79qxhGIaxZMkSQ5IxcuRIu/GeffZZw2azGb/99ps5TZIhydi1a5c57ffffzc8PT2Np59+2qFfw/jz8yowMNB4/PHHHX7+u3TpYgQGBpq9ZGrTpo2RN29eh/fgvw2n53BXIiIiVKhQIQUHB6tNmzby9vbW4sWL9eijj0qSPDw8zIst09PTde7cOXl7e6t06dLavXu3Oc6iRYtUsGBB9ezZ02Ed93Jx5Isvvqg8efKYz5999lkFBgaa/8vcs2ePjhw5oueff17nzp3T2bNndfbsWaWmpqpBgwbavHmzw+mKq1evytPT847r/eabb5SRkaFWrVqZY549e1YBAQE
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Проверка распределения классов в целевой переменной\n",
|
|||
|
"class_distribution = df['HeartDisease'].value_counts()\n",
|
|||
|
"print(\"Распределение классов в HeartDisease:\")\n",
|
|||
|
"print(class_distribution)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация распределения классов\n",
|
|||
|
"sns.countplot(x='HeartDisease', data=df)\n",
|
|||
|
"plt.title('Распределение классов в HeartDisease')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности для каждой выборки\n",
|
|||
|
"def check_balance(df, title):\n",
|
|||
|
" class_distribution = df['HeartDisease'].value_counts()\n",
|
|||
|
" print(f\"Распределение классов в {title}:\")\n",
|
|||
|
" print(class_distribution)\n",
|
|||
|
" sns.countplot(x='HeartDisease', data=df)\n",
|
|||
|
" plt.title(f'Распределение классов в {title}')\n",
|
|||
|
" plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности для обучающей, контрольной и тестовой выборок\n",
|
|||
|
"check_balance(train_df, 'Обучающей выборке')\n",
|
|||
|
"check_balance(val_df, 'Контрольной выборке')\n",
|
|||
|
"check_balance(test_df, 'Тестовой выборке')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Можно заметить, что данные не сбалансированы - во всех выборках количество значений \"No\" превышает \"Yes\" в среднем в 10 раз. Для балансировки данных будет применен метод upsampling"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 43,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Распределение классов в всем датасете:\n",
|
|||
|
"Класс No: 292422 (91.44%)\n",
|
|||
|
"Класс Yes: 27373 (8.56%)\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Обучающей выборке до upsampling:\n",
|
|||
|
"Класс No: 143331 (91.47%)\n",
|
|||
|
"Класс Yes: 13368 (8.53%)\n",
|
|||
|
"Размер обучающей выборки после upsampling: 286662\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Обучающей выборке после upsampling:\n",
|
|||
|
"Класс No: 143331 (50.00%)\n",
|
|||
|
"Класс Yes: 143331 (50.00%)\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Контрольной выборке:\n",
|
|||
|
"Класс No: 61442 (91.49%)\n",
|
|||
|
"Класс Yes: 5715 (8.51%)\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Тестовой выборке:\n",
|
|||
|
"Класс No: 87649 (91.36%)\n",
|
|||
|
"Класс Yes: 8290 (8.64%)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"# Функция для проверки балансировки данных\n",
|
|||
|
"def check_balance(df, title):\n",
|
|||
|
" class_distribution = df['HeartDisease'].value_counts()\n",
|
|||
|
" print(f\"\\nРаспределение классов в {title}:\")\n",
|
|||
|
" for cls, count in class_distribution.items():\n",
|
|||
|
" print(f\"Класс {cls}: {count} ({count / len(df) * 100:.2f}%)\")\n",
|
|||
|
"\n",
|
|||
|
"# Проверка балансировки для всего датасета\n",
|
|||
|
"check_balance(df, 'всем датасете')\n",
|
|||
|
"\n",
|
|||
|
"# Проверка балансировки для обучающей выборки до upsampling\n",
|
|||
|
"check_balance(train_df, 'Обучающей выборке до upsampling')\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке\n",
|
|||
|
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок после upsampling\n",
|
|||
|
"print(\"Размер обучающей выборки после upsampling:\", len(train_df_resampled))\n",
|
|||
|
"\n",
|
|||
|
"# Проверка балансировки для обучающей выборки после upsampling\n",
|
|||
|
"check_balance(train_df_resampled, 'Обучающей выборке после upsampling')\n",
|
|||
|
"\n",
|
|||
|
"# Проверка балансировки для контрольной и тестовой выборок (они не должны измениться)\n",
|
|||
|
"check_balance(val_df, 'Контрольной выборке')\n",
|
|||
|
"check_balance(test_df, 'Тестовой выборке')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Данные были сбалансированы. Теперь можно перейти к конструированию признаков. Поставлены следующие задачи:\n",
|
|||
|
"- Разработка персонализированных программ профилактики сердечно-сосудистых заболеваний\n",
|
|||
|
"- Улучшение качества медицинской помощи"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Унитарное кодирование категориальных признаков (one-hot encoding)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Определение категориальных признаков\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
" 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
|
|||
|
" 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к обучающей выборке\n",
|
|||
|
"train_df_resampled_encoded = pd.get_dummies(train_df_resampled, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к контрольной выборке\n",
|
|||
|
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к тестовой выборке\n",
|
|||
|
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Дискретизация числовых признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 45,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Применение upsampling к обучающей выборке\n",
|
|||
|
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Определение числовых признаков для дискретизации\n",
|
|||
|
"numerical_features = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']\n",
|
|||
|
"\n",
|
|||
|
"# Функция для дискретизации числовых признаков\n",
|
|||
|
"def discretize_features(df, features, bins=5, labels=False):\n",
|
|||
|
" for feature in features:\n",
|
|||
|
" df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
|
|||
|
" return df\n",
|
|||
|
"\n",
|
|||
|
"# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n",
|
|||
|
"train_df_resampled = discretize_features(train_df_resampled, numerical_features)\n",
|
|||
|
"val_df = discretize_features(val_df, numerical_features)\n",
|
|||
|
"test_df = discretize_features(test_df, numerical_features)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Ручной синтез. Создание новых признаков на основе экспертных знаний и логики предметной области. Например, для данных о продаже автомобилей можно создать признак \"возраст автомобиля\" как разницу между текущим годом и годом выпуска."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Применение upsampling к обучающей выборке\n",
|
|||
|
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n",
|
|||
|
"age_mapping = {\n",
|
|||
|
" '18-24': 21,\n",
|
|||
|
" '25-29': 27,\n",
|
|||
|
" '30-34': 32,\n",
|
|||
|
" '35-39': 37,\n",
|
|||
|
" '40-44': 42,\n",
|
|||
|
" '45-49': 47,\n",
|
|||
|
" '50-54': 52,\n",
|
|||
|
" '55-59': 57,\n",
|
|||
|
" '60-64': 62,\n",
|
|||
|
" '65-69': 67,\n",
|
|||
|
" '70-74': 72,\n",
|
|||
|
" '75-79': 77,\n",
|
|||
|
" '80 or older': 80\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"train_df_resampled['Age'] = train_df_resampled['AgeCategory'].map(age_mapping)\n",
|
|||
|
"val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"test_df['Age'] = test_df['AgeCategory'].map(age_mapping)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 47,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке\n",
|
|||
|
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n",
|
|||
|
"age_mapping = {\n",
|
|||
|
" '18-24': 21,\n",
|
|||
|
" '25-29': 27,\n",
|
|||
|
" '30-34': 32,\n",
|
|||
|
" '35-39': 37,\n",
|
|||
|
" '40-44': 42,\n",
|
|||
|
" '45-49': 47,\n",
|
|||
|
" '50-54': 52,\n",
|
|||
|
" '55-59': 57,\n",
|
|||
|
" '60-64': 62,\n",
|
|||
|
" '65-69': 67,\n",
|
|||
|
" '70-74': 72,\n",
|
|||
|
" '75-79': 77,\n",
|
|||
|
" '80 or older': 80\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"train_df_resampled['Age'] = train_df_resampled['AgeCategory'].map(age_mapping)\n",
|
|||
|
"val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"test_df['Age'] = test_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"\n",
|
|||
|
"# Определение числовых признаков для масштабирования\n",
|
|||
|
"numerical_features_to_scale = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Age']\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация StandardScaler\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков в обучающей выборке\n",
|
|||
|
"train_df_resampled[numerical_features_to_scale] = scaler.fit_transform(train_df_resampled[numerical_features_to_scale])\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков в контрольной и тестовой выборках\n",
|
|||
|
"val_df[numerical_features_to_scale] = scaler.transform(val_df[numerical_features_to_scale])\n",
|
|||
|
"test_df[numerical_features_to_scale] = scaler.transform(test_df[numerical_features_to_scale])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Конструирование признаков с применением фреймворка Featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 48,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Обучающая выборка после конструирования признаков:\n",
|
|||
|
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
|
|||
|
"id \n",
|
|||
|
"0 False 32.23 False False False 0.0 \n",
|
|||
|
"1 False 29.53 False False False 0.0 \n",
|
|||
|
"2 False 30.13 False False False 0.0 \n",
|
|||
|
"3 False 35.43 False False False 0.0 \n",
|
|||
|
"4 False 29.53 False False False 0.0 \n",
|
|||
|
"\n",
|
|||
|
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
|
|||
|
"id \n",
|
|||
|
"0 0.0 True Male 75-79 White Yes \n",
|
|||
|
"1 0.0 True Female 50-54 White No \n",
|
|||
|
"2 0.0 False Male 50-54 White No \n",
|
|||
|
"3 15.0 False Female 18-24 Hispanic No \n",
|
|||
|
"4 0.0 True Female 65-69 White Yes \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \\\n",
|
|||
|
"id \n",
|
|||
|
"0 True Fair 6.0 False True True \n",
|
|||
|
"1 True Very good 8.0 False False False \n",
|
|||
|
"2 True Excellent 7.0 False False False \n",
|
|||
|
"3 True Good 7.0 False False False \n",
|
|||
|
"4 False Good 10.0 False False False \n",
|
|||
|
"\n",
|
|||
|
" Age \n",
|
|||
|
"id \n",
|
|||
|
"0 77 \n",
|
|||
|
"1 52 \n",
|
|||
|
"2 52 \n",
|
|||
|
"3 21 \n",
|
|||
|
"4 67 \n",
|
|||
|
"Контрольная выборка после конструирования признаков:\n",
|
|||
|
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
|
|||
|
"id \n",
|
|||
|
"80125 False 22.71 False False False 0.0 \n",
|
|||
|
"116296 False 25.80 False False False 0.0 \n",
|
|||
|
"18780 False 17.74 True False False 0.0 \n",
|
|||
|
"233006 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"182306 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"\n",
|
|||
|
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
|
|||
|
"id \n",
|
|||
|
"80125 0.0 False Male 25-29 White No \n",
|
|||
|
"116296 0.0 False Male 50-54 Hispanic No \n",
|
|||
|
"18780 0.0 False Female 65-69 White No \n",
|
|||
|
"233006 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"182306 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease \\\n",
|
|||
|
"id \n",
|
|||
|
"80125 True Excellent 8.0 False False \n",
|
|||
|
"116296 True Good 7.0 False False \n",
|
|||
|
"18780 True Good 7.0 False False \n",
|
|||
|
"233006 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"182306 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"\n",
|
|||
|
" SkinCancer Age \n",
|
|||
|
"id \n",
|
|||
|
"80125 False 27 \n",
|
|||
|
"116296 False 52 \n",
|
|||
|
"18780 False 67 \n",
|
|||
|
"233006 <NA> <NA> \n",
|
|||
|
"182306 <NA> <NA> \n",
|
|||
|
"Тестовая выборка после конструирования признаков:\n",
|
|||
|
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
|
|||
|
"id \n",
|
|||
|
"271884 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"270361 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"219060 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"24010 False 26.5 False False False 14.0 \n",
|
|||
|
"181930 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"\n",
|
|||
|
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
|
|||
|
"id \n",
|
|||
|
"271884 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"270361 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"219060 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"24010 0.0 True Male 75-79 White Yes \n",
|
|||
|
"181930 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease \\\n",
|
|||
|
"id \n",
|
|||
|
"271884 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"270361 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"219060 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"24010 True Excellent 9.0 False False \n",
|
|||
|
"181930 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"\n",
|
|||
|
" SkinCancer Age \n",
|
|||
|
"id \n",
|
|||
|
"271884 <NA> <NA> \n",
|
|||
|
"270361 <NA> <NA> \n",
|
|||
|
"219060 <NA> <NA> \n",
|
|||
|
"24010 False 77 \n",
|
|||
|
"181930 <NA> <NA> \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n",
|
|||
|
"age_mapping = {\n",
|
|||
|
" '18-24': 21,\n",
|
|||
|
" '25-29': 27,\n",
|
|||
|
" '30-34': 32,\n",
|
|||
|
" '35-39': 37,\n",
|
|||
|
" '40-44': 42,\n",
|
|||
|
" '45-49': 47,\n",
|
|||
|
" '50-54': 52,\n",
|
|||
|
" '55-59': 57,\n",
|
|||
|
" '60-64': 62,\n",
|
|||
|
" '65-69': 67,\n",
|
|||
|
" '70-74': 72,\n",
|
|||
|
" '75-79': 77,\n",
|
|||
|
" '80 or older': 80\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"train_df['Age'] = train_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"test_df['Age'] = test_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='heart_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='train', dataframe=train_df, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='train', max_depth=2)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод первых нескольких строк для проверки\n",
|
|||
|
"print(\"Обучающая выборка после конструирования признаков:\")\n",
|
|||
|
"print(feature_matrix.head())\n",
|
|||
|
"print(\"Контрольная выборка после конструирования признаков:\")\n",
|
|||
|
"print(val_feature_matrix.head())\n",
|
|||
|
"print(\"Тестовая выборка после конструирования признаков:\")\n",
|
|||
|
"print(test_feature_matrix.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Оценка качества каждого набора признаков\n",
|
|||
|
"- Предсказательная способность Метрики: RMSE, MAE, R²\n",
|
|||
|
"- Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
|
|||
|
"- Скорость вычисления Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
|
|||
|
"- Надежность Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
|
|||
|
"- Корреляция Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
|
|||
|
"- Цельность Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 49,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 156699\n",
|
|||
|
"Размер контрольной выборки: 67157\n",
|
|||
|
"Размер тестовой выборки: 95939\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Feature Importance:\n",
|
|||
|
" feature importance\n",
|
|||
|
"0 HeartDisease 0.854583\n",
|
|||
|
"5 Age 0.016474\n",
|
|||
|
"1 BMI 0.013826\n",
|
|||
|
"11 Stroke_True 0.009034\n",
|
|||
|
"2 PhysicalHealth 0.008447\n",
|
|||
|
"12 DiffWalking_False 0.008264\n",
|
|||
|
"4 SleepTime 0.007288\n",
|
|||
|
"37 Diabetic_Yes 0.007211\n",
|
|||
|
"10 Stroke_False 0.006678\n",
|
|||
|
"44 GenHealth_Poor 0.005835\n",
|
|||
|
"42 GenHealth_Fair 0.005484\n",
|
|||
|
"13 DiffWalking_True 0.005208\n",
|
|||
|
"3 MentalHealth 0.004902\n",
|
|||
|
"35 Diabetic_No 0.003429\n",
|
|||
|
"48 KidneyDisease_False 0.003063\n",
|
|||
|
"28 AgeCategory_80 or older 0.003052\n",
|
|||
|
"14 Sex_Female 0.002828\n",
|
|||
|
"15 Sex_Male 0.002511\n",
|
|||
|
"49 KidneyDisease_True 0.002422\n",
|
|||
|
"6 Smoking_False 0.001900\n",
|
|||
|
"7 Smoking_True 0.001779\n",
|
|||
|
"41 GenHealth_Excellent 0.001690\n",
|
|||
|
"45 GenHealth_Very good 0.001673\n",
|
|||
|
"39 PhysicalActivity_False 0.001611\n",
|
|||
|
"43 GenHealth_Good 0.001548\n",
|
|||
|
"34 Race_White 0.001539\n",
|
|||
|
"40 PhysicalActivity_True 0.001510\n",
|
|||
|
"50 SkinCancer_False 0.001392\n",
|
|||
|
"51 SkinCancer_True 0.001354\n",
|
|||
|
"47 Asthma_True 0.001334\n",
|
|||
|
"46 Asthma_False 0.001333\n",
|
|||
|
"27 AgeCategory_75-79 0.001169\n",
|
|||
|
"26 AgeCategory_70-74 0.000973\n",
|
|||
|
"31 Race_Black 0.000868\n",
|
|||
|
"24 AgeCategory_60-64 0.000837\n",
|
|||
|
"32 Race_Hispanic 0.000803\n",
|
|||
|
"25 AgeCategory_65-69 0.000783\n",
|
|||
|
"33 Race_Other 0.000662\n",
|
|||
|
"23 AgeCategory_55-59 0.000617\n",
|
|||
|
"8 AlcoholDrinking_False 0.000559\n",
|
|||
|
"9 AlcoholDrinking_True 0.000550\n",
|
|||
|
"29 Race_American Indian/Alaskan Native 0.000514\n",
|
|||
|
"36 Diabetic_No, borderline diabetes 0.000471\n",
|
|||
|
"22 AgeCategory_50-54 0.000441\n",
|
|||
|
"21 AgeCategory_45-49 0.000326\n",
|
|||
|
"20 AgeCategory_40-44 0.000283\n",
|
|||
|
"30 Race_Asian 0.000265\n",
|
|||
|
"19 AgeCategory_35-39 0.000218\n",
|
|||
|
"38 Diabetic_Yes (during pregnancy) 0.000135\n",
|
|||
|
"18 AgeCategory_30-34 0.000124\n",
|
|||
|
"17 AgeCategory_25-29 0.000108\n",
|
|||
|
"16 AgeCategory_18-24 0.000094\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Определение категориальных признаков\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
" 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
|
|||
|
" 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к обучающей выборке\n",
|
|||
|
"train_df_encoded = pd.get_dummies(train_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к контрольной выборке\n",
|
|||
|
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к тестовой выборке\n",
|
|||
|
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='heart_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='heart', dataframe=train_df_encoded, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='heart', max_depth=2)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df_encoded.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df_encoded.index)\n",
|
|||
|
"\n",
|
|||
|
"# Оценка важности признаков\n",
|
|||
|
"X = feature_matrix\n",
|
|||
|
"y = train_df_encoded['HeartDisease']\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Получение важности признаков\n",
|
|||
|
"importances = model.feature_importances_\n",
|
|||
|
"feature_names = feature_matrix.columns\n",
|
|||
|
"\n",
|
|||
|
"# Сортировка признаков по важности\n",
|
|||
|
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
|
|||
|
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Feature Importance:\")\n",
|
|||
|
"print(feature_importance)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 50,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 15670\n",
|
|||
|
"Размер контрольной выборки: 6716\n",
|
|||
|
"Размер тестовой выборки: 9594\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\Users\\HomePC\\Desktop\\MII_Lab1\\.venv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Accuracy: 1.0\n",
|
|||
|
"Precision: 1.0\n",
|
|||
|
"Recall: 1.0\n",
|
|||
|
"F1 Score: 1.0\n",
|
|||
|
"ROC AUC: 1.0\n",
|
|||
|
"Cross-validated Accuracy: 0.906126356094448\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABD8AAAIjCAYAAAAEDbCUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gV1/bw8e+hl0NRREFEEMEuKkETRRFbwFijEbuiYm8YjYodG0Yllhg1NrDFEqPEiJ2IBU0sEayxoIheC0YFxAIq8/7hy/w8oQjGxJT1eZ557pmZPXuvOeTmuWfdvdfWKIqiIIQQQgghhBBCCPEvpfeuAxBCCCGEEEIIIYT4M0nyQwghhBBCCCGEEP9qkvwQQgghhBBCCCHEv5okP4QQQgghhBBCCPGvJskPIYQQQgghhBBC/KtJ8kMIIYQQQgghhBD/apL8EEIIIYQQQgghxL+aJD+EEEIIIYQQQgjxrybJDyGEEEIIIYQQQvyrSfJDCCGEEEIIIYQQ/2qS/BBCCCGEeAciIiLQaDS5HqNHj/5Txjx8+DCTJk0iJSXlT+n/j8j+Po4fP/6uQ3ljCxcuJCIi4l2HIYQQIhcG7zoAIYQQQoj/ssmTJ1OmTBmda1WqVPlTxjp8+DAhISEEBARgbW39p4zxX7Zw4UKKFStGQEDAuw5FCCHE70jyQwghhBDiHWratCmenp7vOow/5NGjR5ibm7/rMN6Zx48fY2Zm9q7DEEIIkQ9Z9iKEEEII8Te2Y8cO6tWrh7m5ORYWFjRr1oyzZ8/qtDl16hQBAQG4uLhgYmKCnZ0dPXv25N69e2qbSZMm8dlnnwFQpkwZdYlNYmIiiYmJaDSaXJdsaDQaJk2apNOPRqPh3LlzdOrUiSJFilC3bl31/po1a3jvvfcwNTWlaNGidOjQgevXr7/RuwcEBKDVaklKSqJ58+ZotVocHBz46quvADh9+jQNGzbE3NwcJycnvvnmG53ns5fSHDhwgL59+2JjY4OlpSXdunXjwYMHOcZbuHAhlStXxtjYmJIlSzJw4MAcS4R8fHyoUqUKJ06cwNvbGzMzM8aMGYOzszNnz55l//796nfr4+MDwP379xkxYgRVq1ZFq9ViaWlJ06ZNiY+P1+k7JiYGjUbDxo0bmTZtGqVKlcLExIRGjRpx+fLlHPH+/PPPfPTRRxQpUgRzc3Pc3d2ZN2+eTptff/2VTz75hKJFi2JiYoKnpydbt24t7J9CCCH+8WTmhxBCCCHEO5Samspvv/2mc61YsWIArF69mu7du+Pr68vnn3/O48ePWbRoEXXr1uXkyZM4OzsDsGfPHq5cuUKPHj2ws7Pj7NmzLFmyhLNnz/LTTz+h0Who06YNFy9eZN26dcyZM0cdw9bWlrt37xY67nbt2uHm5sb06dNRFAWAadOmMX78ePz9/QkMDOTu3bt8+eWXeHt7c/LkyTdaavPixQuaNm2Kt7c3M2fOZO3atQwaNAhzc3PGjh1L586dadOmDYsXL6Zbt27Url07xzKiQYMGYW1tzaRJk7hw4QKLFi3i2rVrarIBXiZ1QkJCaNy4Mf3791fbHTt2jNjYWAwNDdX+7t27R9OmTenQoQNdunShRIkS+Pj4MHjwYLRaLWPHjgWgRIkSAFy5coXIyEjatWtHmTJluHPnDl9//TX169fn3LlzlCxZUifeGTNmoKenx4gRI0hNTWXmzJl07tyZn3/+WW2zZ88emjdvjr29PUOHDsXOzo7z58+zbds2hg4dCsDZs2fx8vLCwcGB0aNHY25uzsaNG2ndujXfffcdH3/8caH/HkII8Y+lCCGEEEKIv1x4eLgC5HooiqI8fPhQsba2Vnr37q3z3O3btxUrKyud648fP87R/7p16xRAOXDggHpt1qxZCqBcvXpVp+3Vq1cVQAkPD8/RD6BMnDhRPZ84caICKB07dtRpl5iYqOjr6yvTpk3TuX769GnFwMAgx/W8vo9jx46p17p3764AyvTp09VrDx48UExNTRWNRqOsX79evf7rr7/miDW7z/fee0/JzMxUr8+cOVMBlO+//15RFEVJTk5WjIyMlA8//FB58eKF2m7BggUKoKxYsUK9Vr9+fQVQFi9enOMdKleurNSvXz/H9adPn+r0qygvv3NjY2Nl8uTJ6rV9+/YpgFKxYkUlIyNDvT5v3jwFUE6fPq0oiqI8f/5cKVOmjOLk5KQ8ePBAp9+srCz1c6NGjZSqVasqT58+1blfp04dxc3NLUecQgjxbybLXoQQQggh3qGvvvqKPXv26Bzw8v/ZT0lJoWPHjvz222/qoa+vz/vvv8++ffvUPkxNTdXPT58+5bfffuODDz4A4JdffvlT4u7Xr5/O+ebNm8nKysLf318nXjs7O9zc3HTiLazAwED1s7W1NeXLl8fc3Bx/f3/1evny5bG2tubKlSs5nu/Tp4/OzI3+/ftjYGDA9u3bAdi7dy+ZmZkEBQWhp/d///O4d+/eWFpaEhUVpdOfsbExPXr0KHD8xsbGar8vXrzg3r17aLVaypcvn+vfp0ePHhgZGann9erVA1Df7eTJk1y9epWgoKAcs2myZ7Lcv3+fH3/8EX9/fx4+fKj+Pe7du4evry+XLl3if//7X4HfQQgh/ulk2YsQQgghxDtUq1atXAueXrp0CYCGDRvm+pylpaX6+f79+4SEhLB+/XqSk5N12qWmpr7FaP/P75eWXLp0CUVRcHNzy7X9q8mHwjAxMcHW1lbnmpWVFaVKlVJ/6L96PbdaHr+PSavVYm9vT2JiIgDXrl0DXiZQXmVkZISLi4t6P5uDg4NOcuJ1srKymDdvHgsXLuTq1au8ePFCvWdjY5OjfenSpXXOixQpAqC+W0JCApD/rkCXL19GURTGjx/P+PHjc22TnJyMg4NDgd9DCCH+yST5IYQQQgjxN5SVlQW8rPthZ2eX476Bwf/9zzh/f38OHz7MZ599RvXq1dFqtWRlZeHn56f2k5/fJxGyvfoj/fdenW2SHa9Go2HHjh3o6+vnaK/Val8bR25y6yu/68r/rz/yZ/r9u7/O9OnTGT9+PD179mTKlCkULVoUPT09goKCcv37vI13y+53xIgR+Pr65trG1dW1wP0JIcQ/nSQ/hBBCCCH+hsqWLQtA8eLFady4cZ7tHjx4QHR0NCEhIUyYMEG9nj1z5FV5JTmyZxb8fmeT3894eF28iqJQpkwZypUrV+Dn/gqXLl2iQYMG6nl6ejq3bt3io48+AsDJyQmACxcu4OLiorbLzMzk6tWr+X7/r8rr+920aRMNGjRg+fLlOtdTUlLUwrOFkf3PxpkzZ/KMLfs9DA0NCxy/EEL8m0nNDyGEEEKIvyFfX18sLS2ZPn06z549y3E/e4eW7FkCv58VMHfu3BzPmJubAzmTHJaWlhQrVowDBw7oXF+4cGGB423Tpg36+vqEhITkiEVRFJ1td/9qS5Ys0fkOFy1axPPnz2natCkAjRs3xsjIiPnz5+vEvnz5clJTU2nWrFmBxjE3N8/x3cLLv9Hvv5Nvv/32jWtueHh4UKZMGebOnZtjvOxxihcvjo+PD19//TW3bt3K0ceb7PAjhBD/ZDLzQwghhBDib8jS0pJFixbRtWtXPDw86NChA7a2tiQlJREVFYWXlxcLFizA0tJS3Qb22bNnODg4sHv3bq5evZqjz/feew+AsWPH0qFDBwwNDWnRogXm5uYEBgYyY8YMAgMD8fT05MCBA1y8eLHA8ZYtW5apU6cSHBxMYmIirVu3xsLCgqtXr7Jlyxb69OnDiBEj3tr3UxiZmZk0atQIf39/Lly4wMKFC6lbty4tW7YEXm73GxwcTEhICH5+frRs2VJtV7NmTbp06VKgcd577z0WLVrE1KlTcXV1pXjx4jRs2JDmzZszefJkevToQZ06dTh9+jRr167VmWVSGHp6eixatIgWLVpQvXp1evTogb29Pb/++itnz55l165dwMtiunXr1qVq1ar07t0bFxcX7ty5w5EjR7hx4wbx8fFvNL4QQvwTSfJDCCGEEOJvqlOnTpQsWZIZM2Ywa9YsMjIycHBwoF69ejq7jXzzzTcMHjyYr77
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Train Accuracy: 0.9994894703254626\n",
|
|||
|
"Train Precision: 0.9992816091954023\n",
|
|||
|
"Train Recall: 0.9949928469241774\n",
|
|||
|
"Train F1 Score: 0.9971326164874552\n",
|
|||
|
"Train ROC AUC: 0.9974613898298016\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB7UklEQVR4nO3dd3xO9///8eeVRBIjMWoTYu+dysdWFKVaSmvULFUl9qbEqL0J9bGqWlWKtmqVKlVFjaAUib23SiRGJDm/P/pzvp+rMXKR5GQ87rfbdbu5Xudc53pecUReeb/P+9gMwzAEAAAAAHgqJ6sDAAAAAEBiR+MEAAAAAM9B4wQAAAAAz0HjBAAAAADPQeMEAAAAAM9B4wQAAAAAz0HjBAAAAADPQeMEAAAAAM9B4wQAAAAAz0HjBACJlM1m04gRI6yOYbmaNWuqZs2a5vOzZ8/KZrNp8eLFlmX6t39nBF8TAMkPjROAFGHOnDmy2Wzy9fV94WNcvnxZI0aM0MGDB+MuWCK3bds22Ww285EqVSrlz59fbdu21enTp62O55CdO3dqxIgRunPnjmUZvL299eabbz5x2+Ov9cqVKxM41T+edX63b9/e7jxIly6d8ufPr2bNmmnVqlWKjo5O+MAAkMBcrA4AAAlh6dKl8vb21p49e3Ty5EkVLFjQ4WNcvnxZI0eOlLe3t8qWLRv3IROxHj166NVXX9WjR48UGBioefPmad26dTp8+LBy5syZoFny5s2r+/fvK1WqVA69bufOnRo5cqTat2+vDBkyxE+4JOx557ebm5sWLFggSbp//77OnTunH3/8Uc2aNVPNmjX1ww8/yNPT09x/06ZNCRUdABIEI04Akr0zZ85o586dmjp1qrJkyaKlS5daHSnJqVatmlq3bq0OHTpo1qxZmjx5sm7fvq0vvvjiqa8JDw+Plyw2m03u7u5ydnaOl+OnNJGRkYqIiHjufi4uLmrdurVat26tDz/8UJ9++qkOHTqkcePGadu2bfrwww/t9nd1dZWrq2t8xQaABEfjBCDZW7p0qTJmzKiGDRuqWbNmT22c7ty5o969e8vb21tubm7KnTu32rZtq5s3b2rbtm169dVXJUkdOnQwpyw9vs7G29tb7du3j3HMf1/nERERoeHDh6tChQpKnz690qZNq2rVqmnr1q0Of65r167JxcVFI0eOjLEtKChINptNAQEBkqRHjx5p5MiRKlSokNzd3fXKK6+oatWq2rx5s8PvK0m1atWS9E9TKkkjRoyQzWbT0aNH1apVK2XMmFFVq1Y19//qq69UoUIFpU6dWpkyZVKLFi104cKFGMedN2+eChQooNSpU6tixYr67bffYuzztGucjh8/rvfee09ZsmRR6tSpVaRIEQ0dOtTM179/f0lSvnz5zL+/s2fPxkvGuHTp0iV98MEHypYtm9zc3FSiRAktWrTIbp/YnlePv3aTJ0/W9OnTVaBAAbm5uWnOnDnPPL+fZdCgQapbt66+/fZbBQcHm/UnXeM0a9YslShRQmnSpFHGjBnl4+Ojr7/+Ot4+ryR98803qlChgjw8POTp6alSpUppxowZdvvcuXNHvXr1kpeXl9zc3FSwYEFNmDCBKYgA7DBVD0Cyt3TpUr3zzjtydXVVy5Yt9dlnn2nv3r3mD4qSFBYWpmrVqunYsWP64IMPVL58ed28eVNr1qzRxYsXVaxYMY0aNUrDhw9X586dVa1aNUlS5cqVHcoSGhqqBQsWqGXLlvrwww919+5dLVy4UPXq1dOePXscmgKYLVs21ahRQytWrJC/v7/dtuXLl8vZ2VnvvvuupH8ah3HjxqlTp06qWLGiQkNDtW/fPgUGBur111936DNI0qlTpyRJr7zyil393XffVaFChTR27FgZhiFJGjNmjIYNG6b33ntPnTp10o0bNzRr1ixVr15dBw4cMKfNLVy4UB999JEqV66sXr166fTp03rrrbeUKVMmeXl5PTPPn3/+qWrVqilVqlTq3LmzvL29derUKf34448aM2aM3nnnHQUHB2vZsmWaNm2aMmfOLEnKkiVLgmV87NGjR7p582aMekhISIzatWvX9J///Ec2m01+fn7KkiWLNmzYoI4dOyo0NFS9evWS5Ph59fnnn+vBgwfq3Lmz3Nzc1KRJE929e/eFz+82bdpo06ZN2rx5swoXLvzEfebPn68ePXqoWbNm6tmzpx48eKA///xTf/zxh1q1ahUvn3fz5s1q2bKlateurQkTJkiSjh07pt9//109e/aUJN27d081atTQpUuX9NFHHylPnjzauXOnBg8erCtXrmj69Omx+hoASAEMAEjG9u3bZ0gyNm/ebBiGYURHRxu5c+c2evbsabff8OHDDUnG6tWrYxwjOjraMAzD2Lt3ryHJ+Pzzz2PskzdvXqNdu3Yx6jVq1DBq1KhhPo+MjDQePnxot8/ff/9tZMuWzfjggw/s6pIMf3//Z36+//73v4Yk4/Dhw3b14sWLG7Vq1TKflylTxmjYsOEzj/UkW7duNSQZixYtMm7cuGFcvnzZWLduneHt7W3YbDZj7969hmEYhr+/vyHJaNmypd3rz549azg7Oxtjxoyxqx8+fNhwcXEx6xEREUbWrFmNsmXL2n195s2bZ0iy+xqeOXMmxt9D9erVDQ8PD+PcuXN27/P4784wDGPSpEmGJOPMmTPxnvFp8ubNa0h65uPbb7819+/YsaORI0cO4+bNm3bHadGihZE+fXrj3r17hmHE/rx6/LXz9PQ0rl+/brf/s87vdu3aGWnTpn3q5zpw4IAhyejdu7dZ+/e5//bbbxslSpR4+hcnHj5vz549DU9PTyMyMvKp7zl69Ggjbdq0RnBwsF190KBBhrOzs3H+/PlnZgaQcjBVD0CytnTpUmXLlk2vvfaapH+uj2nevLm++eYbRUVFmfutWrVKZcqUUZMmTWIcw2azxVkeZ2dn87qP6Oho3b59W5GRkfLx8VFgYKDDx3vnnXfk4uKi5cuXm7UjR47o6NGjat68uVnLkCGD/vrrL504ceKFcn/wwQfKkiWLcubMqYYNGyo8PFxffPGFfHx87Pbr0qWL3fPVq1crOjpa7733nm7evGk+smfPrkKFCplTq/bt26fr16+rS5cudtfFtG/fXunTp39mths3bmj79u364IMPlCdPHrttsfm7S4iM/8vX11ebN2+O8Zg8ebLdfoZhaNWqVWrUqJEMw7DLVq9ePYWEhJjnjKPnVdOmTc3RtriQLl06SdLdu3efuk+GDBl08eJF7d2794nb4+PzZsiQQeHh4c+ckvrtt9+qWrVqypgxo9171qlTR1FRUdq+fbvDXw8AyRNT9QAkW1FRUfrmm2/02muvmdfiSP/84DplyhRt2bJFdevWlfTP1LOmTZsmSK4vvvhCU6ZM0fHjx/Xo0SOzni9fPoePlTlzZtWuXVsrVqzQ6NGjJf0zTc/FxUXvvPOOud+oUaP09ttvq3DhwipZsqTq16+vNm3aqHTp0rF6n+HDh6tatWpydnZW5syZVaxYMbm4xPwv5N+f4cSJEzIMQ4UKFXricR+vjHfu3DlJirHf4+XPn+XxsuglS5aM1Wf5t4TI+L8yZ86sOnXqxKj/++t548YN3blzR/PmzdO8efOeeKzr16+bf3bkvHqRc+1ZwsLCJEkeHh5P3WfgwIH6+eefVbFiRRUsWFB169ZVq1atVKVKFUnx83m7du2qFStW6I033lCuXLlUt25dvffee6pfv765z4kTJ/Tnn38+tZH83/cEkLLROAFItn755RdduXJF33zzjb755psY25cuXWo2Ti/raSMbUVFRdqu/ffXVV2rfvr0aN26s/v37K2vWrHJ2dta4cePM64Yc1aJFC3Xo0EEHDx5U2bJltWLFCtWuXdu8jkeSqlevrlOnTumHH37Qpk2btGDBAk2bNk1z585Vp06dnvsepUqVeuIP+/+WOnVqu+fR0dGy2WzasGHDE1fBezxSYaXEmvHxwgStW7dWu3btnrjP48bX0fPq339PL+vIkSOS9Mxl/osVK6agoCCtXbtWGzdu1KpVqzRnzhwNHz5cI0eOjJf
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Уменьшение размера выборки для ускорения работы (опционально)\n",
|
|||
|
"df = df.sample(frac=0.1, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Определение категориальных признаков\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
" 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
|
|||
|
" 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к обучающей выборке\n",
|
|||
|
"train_df_encoded = pd.get_dummies(train_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к контрольной выборке\n",
|
|||
|
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к тестовой выборке\n",
|
|||
|
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='heart_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='heart', dataframe=train_df_encoded, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков с уменьшенной глубиной\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='heart', max_depth=1)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df_encoded.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df_encoded.index)\n",
|
|||
|
"\n",
|
|||
|
"# Удаление строк с NaN\n",
|
|||
|
"feature_matrix = feature_matrix.dropna()\n",
|
|||
|
"val_feature_matrix = val_feature_matrix.dropna()\n",
|
|||
|
"test_feature_matrix = test_feature_matrix.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train = feature_matrix.drop('HeartDisease', axis=1)\n",
|
|||
|
"y_train = feature_matrix['HeartDisease']\n",
|
|||
|
"X_val = val_feature_matrix.drop('HeartDisease', axis=1)\n",
|
|||
|
"y_val = val_feature_matrix['HeartDisease']\n",
|
|||
|
"X_test = test_feature_matrix.drop('HeartDisease', axis=1)\n",
|
|||
|
"y_test = test_feature_matrix['HeartDisease']\n",
|
|||
|
"\n",
|
|||
|
"# Выбор модели\n",
|
|||
|
"model = RandomForestClassifier(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Предсказание и оценка\n",
|
|||
|
"y_pred = model.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|||
|
"precision = precision_score(y_test, y_pred)\n",
|
|||
|
"recall = recall_score(y_test, y_pred)\n",
|
|||
|
"f1 = f1_score(y_test, y_pred)\n",
|
|||
|
"roc_auc = roc_auc_score(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Accuracy: {accuracy}\")\n",
|
|||
|
"print(f\"Precision: {precision}\")\n",
|
|||
|
"print(f\"Recall: {recall}\")\n",
|
|||
|
"print(f\"F1 Score: {f1}\")\n",
|
|||
|
"print(f\"ROC AUC: {roc_auc}\")\n",
|
|||
|
"\n",
|
|||
|
"# Кросс-валидация\n",
|
|||
|
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')\n",
|
|||
|
"accuracy_cv = scores.mean()\n",
|
|||
|
"print(f\"Cross-validated Accuracy: {accuracy_cv}\")\n",
|
|||
|
"\n",
|
|||
|
"# Анализ важности признаков\n",
|
|||
|
"feature_importances = model.feature_importances_\n",
|
|||
|
"feature_names = X_train.columns\n",
|
|||
|
"\n",
|
|||
|
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
|
|||
|
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.barplot(x='Importance', y='Feature', data=importance_df)\n",
|
|||
|
"plt.title('Feature Importance')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка на переобучение\n",
|
|||
|
"y_train_pred = model.predict(X_train)\n",
|
|||
|
"\n",
|
|||
|
"accuracy_train = accuracy_score(y_train, y_train_pred)\n",
|
|||
|
"precision_train = precision_score(y_train, y_train_pred)\n",
|
|||
|
"recall_train = recall_score(y_train, y_train_pred)\n",
|
|||
|
"f1_train = f1_score(y_train, y_train_pred)\n",
|
|||
|
"roc_auc_train = roc_auc_score(y_train, y_train_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Train Accuracy: {accuracy_train}\")\n",
|
|||
|
"print(f\"Train Precision: {precision_train}\")\n",
|
|||
|
"print(f\"Train Recall: {recall_train}\")\n",
|
|||
|
"print(f\"Train F1 Score: {f1_train}\")\n",
|
|||
|
"print(f\"Train ROC AUC: {roc_auc_train}\")\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
|
|||
|
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
|
|||
|
"plt.xlabel('Actual HeartDisease')\n",
|
|||
|
"plt.ylabel('Predicted HeartDisease')\n",
|
|||
|
"plt.title('Actual vs Predicted HeartDisease')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|