1205 lines
208 KiB
Plaintext
1205 lines
208 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Набор данных для анализа и прогнозирования сердечного приступа"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Вывод всех столбцов"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',\n",
|
|||
|
" 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',\n",
|
|||
|
" 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',\n",
|
|||
|
" 'Asthma', 'KidneyDisease', 'SkinCancer'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd \n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Бизнес-цели: \n",
|
|||
|
"1. Разработка персонализированных программ профилактики сердечно-сосудистых заболеваний.\n",
|
|||
|
"Цель технического проекта: создание модели машинного обучения, которая будет прогнозировать риск сердечного приступа для каждого пациента на основе его индивидуальных факторов риска, и разработка онлайн-платформы или приложения для предоставления персонализированных рекомендаций по профилактике.\n",
|
|||
|
"2. Улучшение качества медицинской помощи.\n",
|
|||
|
"Цель технического проекта: использование данных для выявления групп населения с наибольшим риском сердечного приступа и разработки целевых программ профилактики и раннего выявления заболеваний."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполним разбиение на 3 выборки: обучающую, контрольную и тестовую"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 156699\n",
|
|||
|
"Размер контрольной выборки: 67157\n",
|
|||
|
"Размер тестовой выборки: 95939\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в HeartDisease:\n",
|
|||
|
"HeartDisease\n",
|
|||
|
"No 292422\n",
|
|||
|
"Yes 27373\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAHHCAYAAACWQK1nAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABJlUlEQVR4nO3deVgVdf//8dcBZVE8oCIgiUuaO2niEpm4RKJS3ablkne5ZnWjpZSad4b7TeXtlku2fBMrLbVSSxM13JU0KdxSM8NbS0FcACUFhfn90cX8PIKKOoro83Fd57o8M+/5zHuGwzkvZ+YMNsMwDAEAAOCGOBV1AwAAAHcCQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAVxETEyObzaaDBw8WdSu4jRGqUCzkvaHlPdzc3FSzZk0NGDBAKSkpRd0ecFcaNWqUbDabjh8/XuD8qlWr6rHHHrvFXf1/M2fOVExMTL7pa9eudXg/cXV1la+vr1q1aqX//Oc/Sk1NvfXN4o5QoqgbAK7FmDFjVK1aNZ07d04bN27Ue++9p++++067du1SqVKliro9ALeRmTNnytvbW7169Spw/ssvv6wmTZooJydHqamp2rx5s0aOHKlJkyZpwYIFatOmjVn77LPPqlu3bnJ1db1F3aM4IlShWGnfvr0aN24sSerXr5/Kly+vSZMmacmSJerevXsRdwfgdvDXX38V6j9ZLVq00FNPPeUwbfv27Wrbtq06d+6sX375RRUrVpQkOTs7y9nZ+ab0izsHp/9QrOX9TzIpKUmSdPLkSb322msKDAyUh4eH7Ha72rdvr+3bt+db9ty5cxo1apRq1qwpNzc3VaxYUZ06ddKBAwckSQcPHnQ4RXDpo1WrVuZYeacT5s+fr3//+9/y8/NT6dKl9cQTT+jw4cP51r1lyxa1a9dOnp6eKlWqlFq2bKlNmzYVuI2tWrUqcP2jRo3KV/vZZ58pKChI7u7uKleunLp161bg+q+0bRfLzc3VlClTVK9ePbm5ucnX11cvvPCCTp065VB3udM8AwYMyDdmQb1PmDAh3z6VpKysLI0cOVI1atSQq6urAgICNHToUGVlZRW4ry7WqlWrfOONHz9eTk5Omjdv3nXtj//+97966KGHVL58ebm7uysoKEhffvllgev/7LPP1LRpU5UqVUply5ZVSEiIVq5c6VCzfPlytWzZUmXKlJHdbleTJk3y9bZw4ULzZ+rt7a1//vOf+vPPPx1qevXq5dBz2bJl1apVK23YsOGq++lGlr0ehX1NLVmyROHh4fL395erq6uqV6+usWPHKicnx6GuVatWql+/vhISEhQSEqJSpUrp3//+t6pWrardu3dr3bp1Bf7OXk6DBg00ZcoUpaWlafr06eb0gq6p2rZtm8LCwuTt7S13d3dVq1ZNffr0uanbu3//fnXu3Fl+fn5yc3NTpUqV1K1bN6WnpzvUFfa9ANbiSBWKtbwAVL58eUnS77//rsWLF+vpp59WtWrVlJKSovfff18tW7bUL7/8In9/f0lSTk6OHnvsMcXFxalbt2565ZVXdPr0aa1atUq7du1S9erVzXV0795dHTp0cFjv8OHDC+xn/PjxstlsGjZsmI4dO6YpU6YoNDRUiYmJcnd3lyStXr1a7du3V1BQkEaOHCknJyfNnj1bbdq00YYNG9S0adN841aqVEnR0dGSpDNnzuill14qcN1vvvmmunTpon79+ik1NVXTpk1TSEiIfv75Z3l5eeVbpn///mrRooUk6euvv9aiRYsc5r/wwguKiYlR79699fLLLyspKUnTp0/Xzz//rE2bNqlkyZIF7odrkZaWZm7bxXJzc/XEE09o48aN6t+/v+rUqaOdO3dq8uTJ+vXXX7V48eJrWs/s2bM1YsQITZw4Uc8880yBNVfbH1OnTtUTTzyhHj16KDs7W1988YWefvppLV
26VOHh4Wbd6NGjNWrUKD300EMaM2aMXFxctGXLFq1evVpt27aV9PeHdJ8+fVSvXj0NHz5cXl5e+vnnnxUbG2v2l7fvmzRpoujoaKWkpGjq1KnatGlTvp+pt7e3Jk+eLEn6448/NHXqVHXo0EGHDx8u8Gd/sRtZVvr7PzMFyc3NzTetsK+pmJgYeXh4KDIyUh4eHlq9erWioqKUkZGhCRMmOIx54sQJtW/fXt26ddM///lP8/qogQMHysPDQ2+88YYkydfX96rbIklPPfWU+vbtq5UrV2r8+PEF1hw7dkxt27ZVhQoV9Prrr8vLy0sHDx7U119/fdO2Nzs7W2FhYcrKytLAgQPl5+enP//8U0uXLlVaWpo8PT0lXd97ASxiAMXA7NmzDUnG999/b6SmphqHDx82vvjiC6N8+fKGu7u78ccffxiGYRjnzp0zcnJyHJZNSkoyXF1djTFjxpjTPv74Y0OSMWnSpHzrys3NNZeTZEyYMCFfTb169YyWLVuaz9esWWNIMu655x4jIyPDnL5gwQJDkjF16lRz7Pvuu88ICwsz12MYhvHXX38Z1apVMx599NF863rooYeM+vXrm89TU1MNScbIkSPNaQcPHjScnZ2N8ePHOyy7c+dOo0SJEvmm79+/35BkzJkzx5w2cuRI4+K3hA0bNhiSjLlz5zosGxsbm296lSpVjPDw8Hy9R0REGJe+zVza+9ChQw0fHx8jKCjIYZ9++umnhpOTk7FhwwaH5WfNmmVIMjZt2pRvfRdr2bKlOd6yZcuMEiVKGK+++mqBtYXZH4bx98/pYtnZ2Ub9+vWNNm3aOIzl5ORkPPnkk/lei3k/87S0NKNMmTJGs2bNjLNnzxZYk52dbfj4+Bj169d3qFm6dKkhyYiKijKn9ezZ06hSpYrDOB988IEhydi6dWuB22zFsnn76EqPi18X1/KaunRfG4ZhvPDCC0apUqWMc+fOmdNatmxpSDJmzZqVr/7S39M8eb+vCxcuvOy2NWjQwChbtqz5PO89KCkpyTAMw1i0aJEhyfjxxx8vO4bV2/vzzz9fte9rfS+AtTj9h2IlNDRUFSpUUEBAgLp16yYPDw8tWrRI99xzjyTJ1dVVTk5/v6xzcnJ04sQJeXh4qFatWvrpp5/Mcb766it5e3tr4MCB+dZx6Smfa/Hcc8+pTJky5vOnnnpKFStW1HfffSdJSkxM1P79+/XMM8/oxIkTOn78uI4fP67MzEw98sgjWr9+fb7/3Z87d05ubm5XXO/XX3+t3NxcdenSxRzz+PHj8vPz03333ac1a9Y41GdnZ0vSFS+6XbhwoTw9PfXoo486jBkUFCQPD498Y54/f96h7vjx4zp37twV+/7zzz81bdo0vfnmm/Lw8Mi3/jp16qh27doOY+ad8r10/ZezdetWdenSRZ07d853hCNPYfaHJPNooySdOnVK6enpatGihcNra/HixcrNzVVUVJT5WsyT99patWqVTp8+rddffz3fzzavZtu2bTp27Jj+9a9/OdSEh4erdu3aWrZsmcNyubm55j5KTEzUJ598oooVK6pOnTpX3KYbXVb6+/dp1apV+R6XHhm6ltfUxfv69OnTOn78uFq0aKG//vpLe/fudRjX1dVVvXv3LlSvheXh4aHTp09fdn7e0Z6lS5fq/PnzBdZYvb15R6JWrFihv/76q8B1Xut7AazF6T8UKzNmzFDNmjVVokQJ+fr6qlatWg4fXLm5uZo6dapmzpyppKQkh+sR8k4RSn+fNqxVq5ZKlLD2V+C+++5zeG6z2VSjRg3zOoz9+/dLknr27HnZMdLT01W2bFnz+fHjx/ONe6n9+/fLMIzL1l16mi4tLU2S8gWZS8dMT0+Xj49PgfOPHTvm8HzlypWqUKHCFfu81MiRI+Xv768XXngh37VJ+/fv1549ey475qXrL8iff/6p8PBwZWZm6sSJE5cNzIXZH9LfH6Djxo1TYmKiw3VdF4974MABOTk5qW7dupcdJ+
+0df369S9b87///U+SVKtWrXzzateurY0bNzpMO3z4sMO+qlixor766qurbtONLitJISEh8vb2zjf90sB4La+p3bt
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в Обучающей выборке:\n",
|
|||
|
"HeartDisease\n",
|
|||
|
"No 143331\n",
|
|||
|
"Yes 13368\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAHHCAYAAACWQK1nAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABUlUlEQVR4nO3deVQV9f8/8Oe9IIvABTeWW6jkhgtqoCGiuJGoaF/SVJRcSazANFypxLVI/Jj7XomVlpIf0VxQwgVTQkFRMUXyg1t6EUVAMBZhfn947vwY70URRwF9Ps6553hnXvOe18y9F57OzB0UgiAIICIiIqJnoqzqBoiIiIheBgxVRERERDJgqCIiIiKSAUMVERERkQwYqoiIiIhkwFBFREREJAOGKiIiIiIZMFQRERERyYChiojoFXPv3j1cvnwZ+fn5Vd0KySw7Oxt///03Hjx4UNWtvJIYqoiIXnKCIGDdunXo1KkTateuDZVKBQcHB/z0009V3VqNcP36dURERIjPL1++jE2bNlVdQ2UUFxcjPDwc7dq1g7GxMerUqYNmzZohNja2qlt7JTFUkV4RERFQKBTiw8TEBM2bN0dQUBAyMjKquj2iV9rRo0fx7rvvwsbGBsbGxmjcuDHGjx+Pq1ev6q0fPnw4PvzwQ7Rs2RI//vgjYmJi8Pvvv2PgwIEvuPOaSaFQIDAwEPv27cPly5cxbdo0HDlypKrbQmFhITw9PTFz5kx0794dkZGRiImJwYEDB+Dm5lbV7b2SDKu6Aare5s6dCwcHBxQUFOCPP/7A6tWrsWfPHqSkpKB27dpV3R7RK2f58uWYOHEi3njjDUyYMAF2dnY4f/48vv32W2zZsgV79uxB586dxfoffvgBW7ZswU8//YThw4dXYec112uvvYZx48ahT58+AAA7OzscOnSoapsCsGDBAiQkJGDfvn3o3r17VbdDABT8g8qkT0REBMaMGYMTJ06gQ4cO4vTJkyfjm2++webNmzFs2LAq7JDo1XP06FF4eHjA3d0d0dHRkv/YXLp0Ce7u7lAqlTh37hzq1KkDAHByckLbtm2rzemqmuzSpUu4ffs22rRpAzMzsyrt5cGDB7C2tsZHH32EL7/8skp7of+Pp//oqfTs2RMAkJ6eDgDIysrClClT4OTkBHNzc6hUKvTt2xenT5/WWbagoACzZ89G8+bNYWJiAjs7OwwcOBCXLl0C8PA6hbKnHB99lP2f2KFDh6BQKLBlyxZ89tlnsLW1hZmZGd555x1cu3ZNZ90JCQno06cPLC0tUbt2bXTr1g1Hjx7Vu43du3fXu/7Zs2fr1P70009wcXGBqakp6tatC19fX73rf9y2lVVaWoolS5agdevWMDExgY2NDcaPH4+7d+9K6ho3boz+/fvrrCcoKEhnTH29L1y4UGefAg9PJ8yaNQtNmzaFsbEx7O3tMW3aNBQWFurdV2V1795dZ7wvv/wSSqUSmzdvrtT++M9//oPOnTujXr16MDU1hYuLC3799Ve96//pp5/w1ltvoXbt2qhTpw48PDywf/9+Sc3evXvRrVs3WFhYQKVSoWPHjjq9RUZGiq9p/fr18f777+Off/6R1IwePVrSc506ddC9e/cKnRJ6lmXnzZsHhUKBjRs36hwpbtKkCcLDw3Hz5k2sXbsWAJCfn4+UlBTY29vD29sbKpUKZmZmOuv73//+B4VCgcWLF+us89ixY1AoFPj5558B6H+dta9n2euOzpw5g9GjR+ONN96AiYkJbG1tMXbsWNy5c0eyrPZSg8uXL4vT9u3bh86dO6N27dqwtLRE//79kZKSIllu9uzZUCgUuH37tjgtMTFRpw8AaNOmjd4jOXv37kXXrl1hZmYGCwsLeHt749y5c5Ka0aNHo3HjxuI+dnV1RVZWFkxNTXX61qeir3d5n2kt7c887RGy1NRU3L17FxYWFujWrdtj9xUAnDp1Cn379oVKpYK5uTl69eqFP//8U1KjfS
3i4uIwfvx41KtXDyqVCiNHjtT7M2j06NGSaQEBATAxMdE5ileR/fyy4Ok/eiraAFSvXj0AD38YR0VFYfDgwXBwcEBGRgbWrl2Lbt264a+//oJarQYAlJSUoH///oiNjYWvry8mTpyIe/fuISYmBikpKWjSpIm4jmHDhqFfv36S9YaEhOjt58svv4RCocD06dNx69YtLFmyBJ6enkhOToapqSkA4MCBA+jbty9cXFwwa9YsKJVKbNiwAT179sSRI0fw1ltv6Yz7+uuvIywsDACQl5eHjz76SO+6Z86ciSFDhuCDDz5AZmYmli9fDg8PD5w6dQpWVlY6ywQEBKBr164AgP/+97/Yvn27ZP748ePFo4SffPIJ0tPTsWLFCpw6dQpHjx5FrVq19O6Hp5GdnS1uW1mlpaV455138McffyAgIAAtW7bE2bNnsXjxYly8eBFRUVFPtZ4NGzbgiy++wKJFi8o97fSk/bF06VK888478PPzQ1FREX755RcMHjwYu3btgre3t1g3Z84czJ49G507d8bcuXNhZGSEhIQEHDhwAL179wbw8BfG2LFj0bp1a4SEhMDKygqnTp1CdHS02J9233fs2BFhYWHIyMjA0qVLcfToUZ3XtH79+mIIuX79OpYuXYp+/frh2rVrel/7siqz7P379xEbG4uuXbvCwcFBb83QoUMREBCAXbt2YcaMGWKAWbBgAWxtbTF16lSYmJhg/fr18PT0RExMDDw8PPDGG2/A3d0dmzZtwqeffioZc9OmTbCwsMD//d//PXabHhUTE4P//e9/GDNmDGxtbXHu3DmsW7cO586dw59//qkToLWOHDmCfv36oVGjRpg1axaKi4uxatUquLu748SJE2jevPlT9VGeH3/8EaNGjYKXlxcWLFiA+/fvY/Xq1ejSpQtOnTolBil9QkNDUVBQUOF1Pct7pTza1zYkJATNmjXDnDlzUFBQgJUrV+rsq3PnzqFr165QqVSYNm0aatWqhbVr16J79+44fPgwXF1dJWMHBQXBysoKs2fPRmpqKlavXo0rV66IwU6fWbNm4bvvvsOWLVskAfZZ9nONJBDpsWHDBgGA8PvvvwuZmZnCtWvXhF9++UWoV6+eYGpqKly/fl0QBEEoKCgQSkpKJMump6cLxsbGwty5c8Vp33//vQBA+Oabb3TWVVpaKi4HQFi4cKFOTevWrYVu3bqJzw8ePCgAEF577TUhNzdXnL5161YBgLB06VJx7GbNmgleXl7iegRBEO7fvy84ODgIb7/9ts66OnfuLLRp00Z8npmZKQAQZs2aJU67fPmyYGBgIHz55ZeSZc+ePSsYGhrqTE9LSxMACBs3bhSnzZo1Syj7ETxy5IgAQNi0aZNk2ejoaJ3pjRo1Ery9vXV6DwwMFB79WD/a+7Rp0wRra2vBxcVFsk9//PFHQalUCkeOHJEsv2bNGgGAcPToUZ31ldWtWzdxvN27dwuGhobC5MmT9dZWZH8IwsPXqayioiKhTZs2Qs+ePSVjKZVK4d1339V5L2pf8+zsbMHCwkJwdXUV/v33X701RUVFgrW1tdCmTRtJza5duwQAQmhoqDht1KhRQqNGjSTjrFu3TgAgHD9+XO82P+uyycnJAgBh4sSJjx2/bdu2Qt26dQVB+P+fKSMjI+HixYtiTWZmplCvXj3BxcVFnLZ27VoBgHD+/HlxWlFRkVC/fn1h1KhR4rQePXoIHh4eknVq17NhwwZx2qOvnSAIws8//ywAEOLi4sRp2p816enpgiAIgouLi2BpaSloNBqx5uLFi0KtWrWEQYMGidO075fMzExx2okTJ3T6EATdnx/37t0TrKyshHHjxknqNBqNYGlpKZn+6OuVkpIiKJVKoW/fvpK+y1PR17u8z7SW9mfewYMHJc/r168v3L59W6zTt698fHwEIyMj4dKlS+K0GzduCBYWFpLXUvtauLi4CEVFReL08PBwAYCwY8cOSb/a94X2vbN8+XJJz0+zn18WPP
1Hj+Xp6YkGDRrA3t4evr6+MDc3x/bt2/Haa68BAIyNjaFUPnwblZSU4M6dOzA3N0eLFi1w8uRJcZxt27ahfv36mDB
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в Контрольной выборке:\n",
|
|||
|
"HeartDisease\n",
|
|||
|
"No 61442\n",
|
|||
|
"Yes 5715\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHHCAYAAACiOWx7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABPZ0lEQVR4nO3de3zP9f//8fs2dsi8N8fNMqbmfMyIJRTLMPVR5JBPIVI+o1g5rMOcKqRiSDqaPqWQosicT7EcVnMKyWei2MxhG4uN7fX7w/f9+nl7j16Weo9u18vlfbnY6/V4PV+P12t7v933Os3NMAxDAAAAuCp3VzcAAABwIyA0AQAAWEBoAgAAsIDQBAAAYAGhCQAAwAJCEwAAgAWEJgAAAAsITQAAABaUcHUDAACg6PLy8nTy5EkVFBQoKCjI1e3c1DjSBABAIT7++GMdPHjQ/DohIUG//fab6xq6xLZt2/TII4+ofPny8vLyUqVKldSlSxdXt3XTIzTd5BISEuTm5ma+vL29VaNGDQ0aNEjp6emubg/4Rxo9erTc3Nx0/Phxp3kvvPCC3NzcNHjwYBd0hktt2LBBw4cP18GDB7Vs2TJFR0fL3d31/20uWrRId999t3788Ue98sorWrFihVasWKF33nnH1a3d9Dg99w8xduxYVatWTefOndO3336rt99+W99884127dqlW265xdXtAZA0depUvfrqq+revbvi4+Nd3c4/3tChQ3XPPfeoWrVqkqSYmBhVqlTJpT2dPHlS/fv3V2RkpObPny9PT0+X9vNPQ2j6h+jQoYOaNGkiSerfv7/KlSunN998U4sWLVLPnj1d3B2AOXPmaMiQIYqIiNBHH31ULI5o/NPVqlVLBw4c0K5du1S+fHndfvvtrm5Js2bN0rlz55SQkEBgcgHelf9Qbdq0kSSlpqZKuvjby3PPPaf69evL19dXNptNHTp00Pbt252WPXfunEaPHq0aNWrI29tblSpV0kMPPaQDBw5Ikg4ePOhwSvDy1z333GOOtXbtWrm5uWnu3Ll6/vnnFRgYqFKlSumBBx7Q4cOHnda9efNmtW/fXn5+frrlllvUunVrbdy4sdBtvOeeewpd/+jRo51qP/74Y4WFhcnHx0dly5ZVjx49Cl3/1bbtUgUFBZoyZYrq1q0rb29vBQQE6Mknn9SpU6cc6kJCQtSpUyen9QwaNMhpzMJ6nzRpktM+laTc3FyNGjVKoaGh8vLyUnBwsIYPH67c3NxC99Wl7rnnHqfxXnnlFbm7u2vOnDlF2h+vv/667rrrLpUrV04+Pj4KCwvT559/Xuj6P/74Y91555265ZZbVKZMGbVq1UrLly93qFm6dKlat26t0qVLy2azqWnTpk69zZ8/3/yeli9fXv/+97+drkfp06ePQ89lypTRPffcow0bNvzhfvozy15u2bJl6tOnjxo3bqwvvvii0P8MZ8yYobp168rLy0tBQUGKjo5WZmamQ80999yjevXqOS37+uuvy83Nzbw+JyQk5Krv0ZCQEEn///v7+uuva/Lkyapatap8fHzUunVr7dq1y2k9q1evVsuWLVWqVCn5+/vrX//6l/bs2VPoNl+ph7Vr1zrUFPb+uNSlPV6uXr16Tj/Lx44dU79+/RQQECBvb281bNhQs2fPLnTMhIQElSpVSs2aNdPtt9+u6Ohoubm5qU+fPpZ6sr9KliypkJAQDRs2THl5eWad/fKJbdu2XXGsy9+P3333nRo1aqRXX31VwcHB8vLyUvXq1TVhwgQVFBQ4LHvhwgWNGzdOt99+u7y8vBQSEqLnn3/e6XPAvp+XL1+uRo0aydvbW3Xq1NEXX3zhUGfv99LrvHbv3q0yZcqoU6dOunDhgjk9MzNTQ4YMMXsMDQ3VxIkTnXq80XCk6R/KHnDKlSsnSfrf//6nhQsX6uGHH1a1atWUnp6ud955R61bt9aPP/5o3pGRn5+vTp06adWqVerRo4eeeeYZnT
59WitWrNCuXbscfhPr2bOnOnbs6LDe2NjYQvt55ZVX5ObmphEjRujYsWOaMmWKIiIilJKSIh8fH0kXP5A7dOigsLAwjRo1Su7u7po1a5batGmjDRs26M4773Qat3Llyho/frwk6cyZMxo4cGCh637ppZfUrVs39e/fXxkZGZo2bZpatWqlH374Qf7+/k7LDBgwQC1btpQkffHFF/ryyy8d5j/55JNKSEhQ37599fTTTys1NVXTp0/XDz/8oI0bN6pkyZKF7odrkZmZaW7bpQoKCvTAAw/o22+/1YABA1S7dm3t3LlTkydP1k8//aSFCxde03pmzZqlF198UW+88YYeeeSRQmv+aH/Ex8frgQceUK9evZSXl6fPPvtMDz/8sBYvXqyoqCizbsyYMRo9erTuuusujR07Vp6entq8ebNWr16tdu3aSbr4wf3444+rbt26io2Nlb+/v3744QclJiaa/dn3fdOmTTV+/Hilp6crPj5eGzdudPqeli9fXpMnT5Yk/frrr4qPj1fHjh11+PDhQr/3l/ozy9pt2bJFXbp0UUhIiJYuXarSpUs71YwePVpjxoxRRESEBg4cqH379untt9/W1q1bi/TzNGXKFJ05c0aStGfPHr366qt6/vnnVbt2bUmSr6+vQ/1HH32k06dPKzo6WufOnVN8fLzatGmjnTt3KiAgQJK0cuVKdejQQbfddptGjx6ts2fPatq0aWrRooW+//57M4hdqmXLlhowYIBDH3+ls2fP6p577tHPP/+sQYMGqVq1apo/f7769OmjzMxMPfPMM1dc9ueff9Z77713Teuzvy9yc3O1bNkyvf766/L29ta4ceOKvA0nTpzQt99+q2+//VaPP/64wsLCtGrVKsXGxurgwYOaOXOmWdu/f3/Nnj1bXbt21bPPPqvNmzdr/Pjx2rNnj9N7dP/+/erevbueeuop9e7dW7NmzdLDDz+sxMRE3XfffYX2cvjwYbVv3161atXSvHnzVKLExUjx+++/q3Xr1vrtt9/05JNPqkqVKtq0aZNiY2N19OhRTZkypcjb73IGbmqzZs0yJBkrV640MjIyjMOHDxufffaZUa5cOcPHx8f49ddfDcMwjHPnzhn5+fkOy6amphpeXl7G2LFjzWkffvihIcl48803ndZVUFBgLifJmDRpklNN3bp1jdatW5tfr1mzxpBk3HrrrUZ2drY5fd68eYYkIz4+3hy7evXqRmRkpLkewzCM33//3ahWrZpx3333Oa3rrrvuMurVq2d+nZGRYUgyRo0aZU47ePCg4eHhYbzyyisOy+7cudMoUaKE0/T9+/cbkozZs2eb00aNGmVc+lbasGGDIcn45JNPHJZNTEx0ml61alUjKirKqffo6Gjj8rfn5b0PHz7cqFixohEWFuawT//73/8a7u7uxoYNGxyWnzlzpiHJ2Lhxo9P6LtW6dWtzvCVLlhglSpQwnn322UJrrewPw7j4fbpUXl6eUa9ePaNNmzYOY7m7uxsPPvig08+i/XuemZlplC5d2mjWrJlx9uzZQmvy8vKMihUrGvXq1XOoWbx4sSHJiIuLM6f17t3bqFq1qsM47777riHJ2LJlS6HbfD2Wte+jDRs2GOXKlTMkGQMGDCi09tixY4anp6fRrl07h/0yffp0Q5Lx4YcfmtNat25t1K1b12mMSZMmGZKM1NRUp3n29+CaNWuc5tnfy5d+VhiGYWzevNmQZAwdOtSc1qhRI6NixYrGiRMnzGnbt2833N3djccee8xp7FtvvdXo27fvVfu40vujsB6tfN5MmTLFkGR8/PHH5rS8vDwjPDzc8PX1NT+D7GPOmjXLrOvWrZtRr149Izg42Ojdu7elni5d3jAMIygoyOjYsaP5tf3zeevWrVcc69L3o/1rScbo0aMd6vr06WNIMnbu3GkYhmGkpKQYkoz+/fs71D333HOGJGP16tXmtKpVqxqSjAULFpjTsrKyjEqVKhl33HGHU7+pqanGyZMnjT
p16hg1a9Y0jh8/7rCOcePGGaVKlTJ++uknh+kjR440PDw8jEOHDl1xe4s7Ts/9Q0RERKhChQoKDg5Wjx495Ovrqy+
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в Тестовой выборке:\n",
|
|||
|
"HeartDisease\n",
|
|||
|
"No 87649\n",
|
|||
|
"Yes 8290\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHHCAYAAACiOWx7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABGQ0lEQVR4nO3dd3gUdfv+/XOTkCIhhJYmAXLTS25KKAYEBCKhyRdFKSLSBNQAYpCm0kGaSBURC6CCIiBFkNCbiJQgTQGRO0iRhFCSQCiBZJ4//GUelg04BGSDvl/HscfBzlzzmWtms5uTmdmJzTAMQwAAALgjF2c3AAAA8DAgNAEAAFhAaAIAALCA0AQAAGABoQkAAMACQhMAAIAFhCYAAAALCE0AAAAWuDm7AQAA/q2uXr2q8+fPy83NTX5+fs5uB3+BI00AgH+cadOmKSkpyXw+adIkpaamOq+hm6xdu1bNmzeXr6+vvLy89Oijj+q1115zdluwgNAES2bPni2bzWY+PD09VapUKfXo0UMJCQnObg/4Vzl27Jjd+/FOj2PHjjm7Xaf49ttvNXToUJ04cUJz587VoEGD5OXl5ey2NH36dEVGRio5OVmTJ0/WmjVrtGbNGg0fPtzZrcECTs/hrgwfPlwhISG6evWqvv/+e33wwQf67rvvdODAAT3yyCPObg/4VyhUqJA+//xzu2kTJkzQyZMnNXHiRIfaf6M333xTzZs31+TJk+Xi4qIJEybIxcW5xwmOHDmi6OhodevWTdOnT5fNZnNqP7h7Nv5gL6yYPXu2OnXqpJ07d6pq1arm9D59+ui9997TvHnz1LZtWyd2CPy7NWvWTAcOHPjXHlnKSlJSkg4ePKjg4GAVLlzY2e2oZ8+e+vbbb3XkyBHlypXL2e0gGzg9h3tSv359SVJcXJwk6fz583rjjTcUGhoqb29v+fj4qHHjxtq7d6/DslevXtXQoUNVqlQpeXp6KjAwUM8884yOHj0q6a9PQTzxxBPmWBs3bpTNZtP8+fP15ptvKiAgQLlz51bz5s114sQJh3Vv375djRo1Ut68efXII4+obt262rp1a5bb+MQTT2S5/qFDhzrUfvHFFwoLC5OXl5fy58+vNm3aZLn+O23bzTIyMjRp0iSVL19enp6e8vf3V/fu3XXhwgW7umLFiqlZs2YO6+nRo4fDmFn1Pn78eId9KknXrl3TkCFDVKJECXl4eCg4OFj9+vXTtWvXstxXN3viiSccxhs1apRcXFw0b968bO2Pd999VzVr1lSBAgXk5eWlsLAwLVy4MMv1f/HFF6pevboeeeQR5cuXT3Xq1NHq1avtalauXKm6desqT5488vHxUbVq1Rx6W7BggfmaFixYUC+88IJOnTplV9OxY0e7nvPly6cnnnhCW7Zs+cv9dC/L3o2kpCT17t1bwcHB8vDwUIkSJTR27FhlZGTY1WVkZGjy5MkKDQ2Vp6enChUqpEaNGmnXrl2S9JenA29+zc+cOaMuXbrI399fnp6eqlixoubMmWO3vltf+1y5cqlYsWLq27ev0tLS7Gr/97//6bnnnlP+/Pn1yCOP6LHHHtOKFSvsajI/CzZu3ChfX1+Fh4ercOHCatq06W3ft1ktn/nw8PBQqVKlNHr0aN18jGHo0KGy2Ww6e/bsbccqVqyYOnbsaD7/8ccfFRYWpldffVX+/v7y8PBQhQoV9NFHHzksm5qaqj59+pivV+nSpfXuu+/q1uMcNptNPXr00Ny5c1W6dGl5enoqLCxMmzdvtqvL7PdmGzZskIeHh15++WW76adOnVLnzp3NHsuXL69PP/30jvvt34LTc7gnmQGnQIECkv78UFuyZImee+45hYSEKCEhQR9++KHq1q2rX375RUFBQZKk9PR0NWvWTOvWrVObNm302muv6eLFi1qzZo0OHDig4sWLm+to27atmjRpYrfegQMHZtnPqFGjZLPZ1L9/f505c0aTJk1SRESE9uzZY1
7PsH79ejVu3FhhYWEaMmSIXFxcNGvWLNWvX19btmxR9erVHcYtXLiwRo8eLUm6dOmSXnnllSzXPWjQILVq1UovvfSSEhMTNXXqVNWpU0c//fSTfH19HZbp1q2bateuLUn65ptvtHjxYrv53bt3N4/y9erVS3FxcZo2bZp++uknbd269b78bzUpKcnctptlZGSoefPm+v7779WtWzeVLVtW+/fv18SJE/Xrr79qyZIld7WeWbNm6e2339aECRP0/PPPZ1nzV/tj8uTJat68udq1a6e0tDR99dVXeu6557R8+XI1bdrUrBs2bJiGDh2qmjVravjw4XJ3d9f27du1fv16NWzYUNKfR087d+6s8uXLa+DAgfL19dVPP/2kmJgYs7/MfV+tWjWNHj1aCQkJmjx5srZu3erwmhYsWNA8NXby5ElNnjxZTZo00YkTJ7J87W92L8tacfnyZdWtW1enTp1S9+7dVaRIEf3www8aOHCgTp8+rUmTJpm1Xbp00ezZs9W4cWO99NJLunHjhrZs2aIff/xRVatWtTstuGXLFs2cOVMTJ05UwYIFJUn+/v6SpCtXruiJJ57Qb7/9ph49eigkJEQLFixQx44dlZSU5HDhc+Zrf+3aNa1atUrvvvuuPD09NWLECElSQkKCatasqcuXL6tXr14qUKCA5syZo+bNm2vhwoV6+umnb7v9mzdv1nfffXdX++zNN99U2bJldeXKFfM/Y35+furSpctdjXOzc+fOadeuXXJzc1NUVJSKFy+uJUuWqFu3bjp37pwGDBggSTIMQ82bN9eGDRvUpUsXVapUSatWrVLfvn116tQph1OwmzZt0vz589WrVy95eHho+vTpatSokXbs2KEKFSpk2cvevXvVokULNWnSRO+//745PSEhQY899pgZxgoVKqSVK1eqS5cuSklJUe/evbO9/f8IBmDBrFmzDEnG2rVrjcTEROPEiRPGV199ZRQoUMDw8vIyTp48aRiGYVy9etVIT0+3WzYuLs7w8PAwhg8fbk779NNPDUnGe++957CujIwMczlJxvjx4x1qypcvb9StW9d8vmHDBkOS8eijjxopKSnm9K+//tqQZEyePNkcu2TJkkZkZKS5HsMwjMuXLxshISHGk08+6bCumjVrGhUqVDCfJyYmGpKMIUOGmNOOHTtmuLq6GqNGjbJbdv/+/Yabm5vD9CNHjhiSjDlz5pjThgwZYtz8ltyyZYshyZg7d67dsjExMQ7TixYtajRt2tSh96ioKOPWt/mtvffr18/w8/MzwsLC7Pbp559/bri4uBhbtmyxW37GjBmGJGPr1q0O67tZ3bp1zfFWrFhhuLm5GX369Mmy1sr+MIw/X6ebpaWlGRUqVDDq169vN5aLi4vx9NNPO/wsZr7mSUlJRp48eYwaNWoYV65cybImLS3N8PPzMypUqGBXs3z5ckOSMXjwYHNahw4djKJFi9qNM3PmTEOSsWPHjiy3+X4se7OmTZs6jJNpxIgRRu7cuY1ff/3VbvqAAQMMV1dX4/jx44ZhGMb69esNSUavXr0cxrj5/ZIp83MhLi7OYd6kSZMMScYXX3xhTktLSzPCw8MNb29v832a+T6fNWuW3fJBQUFGkyZNzOe9e/c2JNn9PF68eNEICQkxihUrZr7WmZ8FGzZsMOtq1KhhNG7c2OFnPytZLX/16lXDxcXFePXVV81pmT+fiYmJtx2raNGiRocOHeyeSzJmz55tTrtx44bRoEEDw8PDwzh79qxhGIaxZMkSQ5IxcuRIu/GeffZZw2azGb/99ps5TZIhydi1a5c57ffffzc8PT2Np59+2qFfw/jz8yowMNB4/PHHHX7+u3TpYgQGBpq9ZGrTpo2RN29eh/fgvw2n53BXIiIiVKhQIQUHB6tNmzby9vbW4sWL9eijj0qSPDw8zIst09PTde7cOXl7e6t06dLavXu3Oc6iRYtUsGBB9ezZ02Ed93Jx5Isvvqg8efKYz5999lkFBgaa/8
vcs2ePjhw5oueff17nzp3T2bNndfbsWaWmpqpBgwbavHmzw+mKq1evytPT847r/eabb5SRkaFWrVqZY549e1YBAQE
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Проверка распределения классов в целевой переменной\n",
|
|||
|
"class_distribution = df['HeartDisease'].value_counts()\n",
|
|||
|
"print(\"Распределение классов в HeartDisease:\")\n",
|
|||
|
"print(class_distribution)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация распределения классов\n",
|
|||
|
"sns.countplot(x='HeartDisease', data=df)\n",
|
|||
|
"plt.title('Распределение классов в HeartDisease')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности для каждой выборки\n",
|
|||
|
"def check_balance(df, title):\n",
|
|||
|
" class_distribution = df['HeartDisease'].value_counts()\n",
|
|||
|
" print(f\"Распределение классов в {title}:\")\n",
|
|||
|
" print(class_distribution)\n",
|
|||
|
" sns.countplot(x='HeartDisease', data=df)\n",
|
|||
|
" plt.title(f'Распределение классов в {title}')\n",
|
|||
|
" plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности для обучающей, контрольной и тестовой выборок\n",
|
|||
|
"check_balance(train_df, 'Обучающей выборке')\n",
|
|||
|
"check_balance(val_df, 'Контрольной выборке')\n",
|
|||
|
"check_balance(test_df, 'Тестовой выборке')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Можно заметить, что данные не сбалансированы: во всех выборках количество значений \"No\" превышает количество значений \"Yes\" примерно в 10–11 раз. Для балансировки обучающей выборки будет применён метод upsampling (контрольная и тестовая выборки остаются без изменений)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки до upsampling: 156699\n",
|
|||
|
"Размер контрольной выборки: 67157\n",
|
|||
|
"Размер тестовой выборки: 95939\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в всем датасете:\n",
|
|||
|
"Класс No: 292422 (91.44%)\n",
|
|||
|
"Класс Yes: 27373 (8.56%)\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Обучающей выборке до upsampling:\n",
|
|||
|
"Класс No: 143331 (91.47%)\n",
|
|||
|
"Класс Yes: 13368 (8.53%)\n",
|
|||
|
"Размер обучающей выборки после upsampling: 286662\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Обучающей выборке после upsampling:\n",
|
|||
|
"Класс No: 143331 (50.00%)\n",
|
|||
|
"Класс Yes: 143331 (50.00%)\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Контрольной выборке:\n",
|
|||
|
"Класс No: 61442 (91.49%)\n",
|
|||
|
"Класс Yes: 5715 (8.51%)\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в Тестовой выборке:\n",
|
|||
|
"Класс No: 87649 (91.36%)\n",
|
|||
|
"Класс Yes: 8290 (8.64%)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки до upsampling:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Функция для проверки балансировки данных\n",
|
|||
|
"def check_balance(df, title):\n",
|
|||
|
" class_distribution = df['HeartDisease'].value_counts()\n",
|
|||
|
" print(f\"\\nРаспределение классов в {title}:\")\n",
|
|||
|
" for cls, count in class_distribution.items():\n",
|
|||
|
" print(f\"Класс {cls}: {count} ({count / len(df) * 100:.2f}%)\")\n",
|
|||
|
"\n",
|
|||
|
"# Проверка балансировки для всего датасета\n",
|
|||
|
"check_balance(df, 'всем датасете')\n",
|
|||
|
"\n",
|
|||
|
"# Проверка балансировки для обучающей выборки до upsampling\n",
|
|||
|
"check_balance(train_df, 'Обучающей выборке до upsampling')\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке\n",
|
|||
|
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок после upsampling\n",
|
|||
|
"print(\"Размер обучающей выборки после upsampling:\", len(train_df_resampled))\n",
|
|||
|
"\n",
|
|||
|
"# Проверка балансировки для обучающей выборки после upsampling\n",
|
|||
|
"check_balance(train_df_resampled, 'Обучающей выборке после upsampling')\n",
|
|||
|
"\n",
|
|||
|
"# Проверка балансировки для контрольной и тестовой выборок (они не должны измениться)\n",
|
|||
|
"check_balance(val_df, 'Контрольной выборке')\n",
|
|||
|
"check_balance(test_df, 'Тестовой выборке')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Данные были сбалансированы. Теперь можно перейти к конструированию признаков. Поставлены следующие задачи:\n",
|
|||
|
"\n",
|
|||
|
"1. Разработка персонализированных программ профилактики сердечно-сосудистых заболеваний\n",
|
|||
|
"2. Улучшение качества медицинской помощи"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Унитарное кодирование категориальных признаков (one-hot encoding)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"# Определение категориальных признаков\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
" 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
|
|||
|
" 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к обучающей выборке\n",
|
|||
|
"train_df_resampled_encoded = pd.get_dummies(train_df_resampled, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к контрольной выборке\n",
|
|||
|
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к тестовой выборке\n",
|
|||
|
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Дискретизация числовых признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке\n",
|
|||
|
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Определение числовых признаков для дискретизации\n",
|
|||
|
"numerical_features = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']\n",
|
|||
|
"\n",
|
|||
|
"# Функция для дискретизации числовых признаков\n",
|
|||
|
"def discretize_features(df, features, bins=5, labels=False):\n",
|
|||
|
" for feature in features:\n",
|
|||
|
" df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
|
|||
|
" return df\n",
|
|||
|
"\n",
|
|||
|
"# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n",
|
|||
|
"train_df_resampled = discretize_features(train_df_resampled, numerical_features)\n",
|
|||
|
"val_df = discretize_features(val_df, numerical_features)\n",
|
|||
|
"test_df = discretize_features(test_df, numerical_features)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Ручной синтез. \n",
|
|||
|
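"Создание новых признаков на основе экспертных знаний и логики предметной области. Например, для данных о продаже автомобилей можно создать признак \"возраст автомобиля\" как разницу между текущим годом и годом выпуска. В этом наборе данных аналогичным образом создаётся числовой признак \"Age\": каждой возрастной категории из \"AgeCategory\" сопоставляется примерный возраст (середина интервала, для категории \"80 or older\" — 80)."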
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"# Load the data\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Hold out 30% of the rows as the test set\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Split the remaining rows again: 70% stay in training, 30% become the validation set\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке\n",
|
|||
|
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n",
|
|||
|
"age_mapping = {\n",
|
|||
|
" '18-24': 21,\n",
|
|||
|
" '25-29': 27,\n",
|
|||
|
" '30-34': 32,\n",
|
|||
|
" '35-39': 37,\n",
|
|||
|
" '40-44': 42,\n",
|
|||
|
" '45-49': 47,\n",
|
|||
|
" '50-54': 52,\n",
|
|||
|
" '55-59': 57,\n",
|
|||
|
" '60-64': 62,\n",
|
|||
|
" '65-69': 67,\n",
|
|||
|
" '70-74': 72,\n",
|
|||
|
" '75-79': 77,\n",
|
|||
|
" '80 or older': 80\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"train_df_resampled['Age'] = train_df_resampled['AgeCategory'].map(age_mapping)\n",
|
|||
|
"val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"test_df['Age'] = test_df['AgeCategory'].map(age_mapping)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 16,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling к обучающей выборке\n",
|
|||
|
"X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n",
|
|||
|
"y_train = train_df['HeartDisease'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация RandomOverSampler\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение upsampling\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame с балансированными данными\n",
|
|||
|
"train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n",
|
|||
|
"age_mapping = {\n",
|
|||
|
" '18-24': 21,\n",
|
|||
|
" '25-29': 27,\n",
|
|||
|
" '30-34': 32,\n",
|
|||
|
" '35-39': 37,\n",
|
|||
|
" '40-44': 42,\n",
|
|||
|
" '45-49': 47,\n",
|
|||
|
" '50-54': 52,\n",
|
|||
|
" '55-59': 57,\n",
|
|||
|
" '60-64': 62,\n",
|
|||
|
" '65-69': 67,\n",
|
|||
|
" '70-74': 72,\n",
|
|||
|
" '75-79': 77,\n",
|
|||
|
" '80 or older': 80\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"train_df_resampled['Age'] = train_df_resampled['AgeCategory'].map(age_mapping)\n",
|
|||
|
"val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"test_df['Age'] = test_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"\n",
|
|||
|
"# Определение числовых признаков для масштабирования\n",
|
|||
|
"numerical_features_to_scale = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Age']\n",
|
|||
|
"\n",
|
|||
|
"# Инициализация StandardScaler\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков в обучающей выборке\n",
|
|||
|
"train_df_resampled[numerical_features_to_scale] = scaler.fit_transform(train_df_resampled[numerical_features_to_scale])\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков в контрольной и тестовой выборках\n",
|
|||
|
"val_df[numerical_features_to_scale] = scaler.transform(val_df[numerical_features_to_scale])\n",
|
|||
|
"test_df[numerical_features_to_scale] = scaler.transform(test_df[numerical_features_to_scale])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Конструирование признаков с применением фреймворка Featuretools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
|
|||
|
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Обучающая выборка после конструирования признаков:\n",
|
|||
|
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
|
|||
|
"id \n",
|
|||
|
"0 False 32.23 False False False 0.0 \n",
|
|||
|
"1 False 29.53 False False False 0.0 \n",
|
|||
|
"2 False 30.13 False False False 0.0 \n",
|
|||
|
"3 False 35.43 False False False 0.0 \n",
|
|||
|
"4 False 29.53 False False False 0.0 \n",
|
|||
|
"\n",
|
|||
|
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
|
|||
|
"id \n",
|
|||
|
"0 0.0 True Male 75-79 White Yes \n",
|
|||
|
"1 0.0 True Female 50-54 White No \n",
|
|||
|
"2 0.0 False Male 50-54 White No \n",
|
|||
|
"3 15.0 False Female 18-24 Hispanic No \n",
|
|||
|
"4 0.0 True Female 65-69 White Yes \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \\\n",
|
|||
|
"id \n",
|
|||
|
"0 True Fair 6.0 False True True \n",
|
|||
|
"1 True Very good 8.0 False False False \n",
|
|||
|
"2 True Excellent 7.0 False False False \n",
|
|||
|
"3 True Good 7.0 False False False \n",
|
|||
|
"4 False Good 10.0 False False False \n",
|
|||
|
"\n",
|
|||
|
" Age \n",
|
|||
|
"id \n",
|
|||
|
"0 77 \n",
|
|||
|
"1 52 \n",
|
|||
|
"2 52 \n",
|
|||
|
"3 21 \n",
|
|||
|
"4 67 \n",
|
|||
|
"Контрольная выборка после конструирования признаков:\n",
|
|||
|
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
|
|||
|
"id \n",
|
|||
|
"80125 False 22.71 False False False 0.0 \n",
|
|||
|
"116296 False 25.80 False False False 0.0 \n",
|
|||
|
"18780 False 17.74 True False False 0.0 \n",
|
|||
|
"233006 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"182306 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"\n",
|
|||
|
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
|
|||
|
"id \n",
|
|||
|
"80125 0.0 False Male 25-29 White No \n",
|
|||
|
"116296 0.0 False Male 50-54 Hispanic No \n",
|
|||
|
"18780 0.0 False Female 65-69 White No \n",
|
|||
|
"233006 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"182306 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease \\\n",
|
|||
|
"id \n",
|
|||
|
"80125 True Excellent 8.0 False False \n",
|
|||
|
"116296 True Good 7.0 False False \n",
|
|||
|
"18780 True Good 7.0 False False \n",
|
|||
|
"233006 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"182306 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"\n",
|
|||
|
" SkinCancer Age \n",
|
|||
|
"id \n",
|
|||
|
"80125 False 27 \n",
|
|||
|
"116296 False 52 \n",
|
|||
|
"18780 False 67 \n",
|
|||
|
"233006 <NA> <NA> \n",
|
|||
|
"182306 <NA> <NA> \n",
|
|||
|
"Тестовая выборка после конструирования признаков:\n",
|
|||
|
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
|
|||
|
"id \n",
|
|||
|
"271884 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"270361 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"219060 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"24010 False 26.5 False False False 14.0 \n",
|
|||
|
"181930 <NA> NaN <NA> <NA> <NA> NaN \n",
|
|||
|
"\n",
|
|||
|
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
|
|||
|
"id \n",
|
|||
|
"271884 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"270361 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"219060 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"24010 0.0 True Male 75-79 White Yes \n",
|
|||
|
"181930 NaN <NA> NaN NaN NaN NaN \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease \\\n",
|
|||
|
"id \n",
|
|||
|
"271884 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"270361 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"219060 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"24010 True Excellent 9.0 False False \n",
|
|||
|
"181930 <NA> NaN NaN <NA> <NA> \n",
|
|||
|
"\n",
|
|||
|
" SkinCancer Age \n",
|
|||
|
"id \n",
|
|||
|
"271884 <NA> <NA> \n",
|
|||
|
"270361 <NA> <NA> \n",
|
|||
|
"219060 <NA> <NA> \n",
|
|||
|
"24010 False 77 \n",
|
|||
|
"181930 <NA> <NA> \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n",
|
|||
|
"age_mapping = {\n",
|
|||
|
" '18-24': 21,\n",
|
|||
|
" '25-29': 27,\n",
|
|||
|
" '30-34': 32,\n",
|
|||
|
" '35-39': 37,\n",
|
|||
|
" '40-44': 42,\n",
|
|||
|
" '45-49': 47,\n",
|
|||
|
" '50-54': 52,\n",
|
|||
|
" '55-59': 57,\n",
|
|||
|
" '60-64': 62,\n",
|
|||
|
" '65-69': 67,\n",
|
|||
|
" '70-74': 72,\n",
|
|||
|
" '75-79': 77,\n",
|
|||
|
" '80 or older': 80\n",
|
|||
|
"}\n",
|
|||
|
"\n",
|
|||
|
"train_df['Age'] = train_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"test_df['Age'] = test_df['AgeCategory'].map(age_mapping)\n",
|
|||
|
"\n",
|
|||
|
"# Entity set holding the training split only\n",
|
|||
|
"es = ft.EntitySet(id='heart_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='train', dataframe=train_df, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Generate the feature definitions from the training split\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='train', max_depth=2)\n",
|
|||
|
"\n",
|
|||
|
"# The training entity set contains no validation/test rows, so each split gets its own\n",
|
|||
|
"# entity set. The dataframe is registered as 'train' again so that feature_defs,\n",
|
|||
|
"# which were generated against a dataframe of that name, can be evaluated on it.\n",
|
|||
|
"val_es = ft.EntitySet(id='heart_data_val')\n",
|
|||
|
"val_es = val_es.add_dataframe(dataframe_name='train', dataframe=val_df, index='id')\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=val_es)\n",
|
|||
|
"\n",
|
|||
|
"test_es = ft.EntitySet(id='heart_data_test')\n",
|
|||
|
"test_es = test_es.add_dataframe(dataframe_name='train', dataframe=test_df, index='id')\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=test_es)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод первых нескольких строк для проверки\n",
|
|||
|
"print(\"Обучающая выборка после конструирования признаков:\")\n",
|
|||
|
"print(feature_matrix.head())\n",
|
|||
|
"print(\"Контрольная выборка после конструирования признаков:\")\n",
|
|||
|
"print(val_feature_matrix.head())\n",
|
|||
|
"print(\"Тестовая выборка после конструирования признаков:\")\n",
|
|||
|
"print(test_feature_matrix.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Оценка качества каждого набора признаков\n",
|
|||
|
"\n",
|
|||
|
"Предсказательная способность\n",
|
|||
|
"Метрики: RMSE, MAE, R²\n",
|
|||
|
"\n",
|
|||
|
"Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
|
|||
|
"\n",
|
|||
|
"Скорость вычисления\n",
|
|||
|
"Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
|
|||
|
"\n",
|
|||
|
"Надежность\n",
|
|||
|
"Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
|
|||
|
"\n",
|
|||
|
"Корреляция\n",
|
|||
|
"Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
|
|||
|
"\n",
|
|||
|
"Цельность\n",
|
|||
|
"Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 156699\n",
|
|||
|
"Размер контрольной выборки: 67157\n",
|
|||
|
"Размер тестовой выборки: 95939\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Feature Importance:\n",
|
|||
|
" feature importance\n",
|
|||
|
"0 HeartDisease 0.851120\n",
|
|||
|
"1 BMI 0.014203\n",
|
|||
|
"9 Stroke_No 0.012600\n",
|
|||
|
"2 PhysicalHealth 0.008628\n",
|
|||
|
"12 DiffWalking_Yes 0.008111\n",
|
|||
|
"11 DiffWalking_No 0.007721\n",
|
|||
|
"36 Diabetic_Yes 0.007583\n",
|
|||
|
"10 Stroke_Yes 0.007551\n",
|
|||
|
"4 SleepTime 0.007525\n",
|
|||
|
"43 GenHealth_Poor 0.006605\n",
|
|||
|
"27 AgeCategory_80 or older 0.006269\n",
|
|||
|
"34 Diabetic_No 0.005300\n",
|
|||
|
"3 MentalHealth 0.005102\n",
|
|||
|
"41 GenHealth_Fair 0.004277\n",
|
|||
|
"48 KidneyDisease_Yes 0.003435\n",
|
|||
|
"47 KidneyDisease_No 0.003086\n",
|
|||
|
"13 Sex_Female 0.002607\n",
|
|||
|
"26 AgeCategory_75-79 0.002567\n",
|
|||
|
"25 AgeCategory_70-74 0.002462\n",
|
|||
|
"14 Sex_Male 0.002457\n",
|
|||
|
"6 Smoking_Yes 0.002127\n",
|
|||
|
"5 Smoking_No 0.001934\n",
|
|||
|
"42 GenHealth_Good 0.001787\n",
|
|||
|
"44 GenHealth_Very good 0.001734\n",
|
|||
|
"33 Race_White 0.001731\n",
|
|||
|
"50 SkinCancer_Yes 0.001687\n",
|
|||
|
"38 PhysicalActivity_No 0.001658\n",
|
|||
|
"39 PhysicalActivity_Yes 0.001585\n",
|
|||
|
"49 SkinCancer_No 0.001513\n",
|
|||
|
"40 GenHealth_Excellent 0.001451\n",
|
|||
|
"24 AgeCategory_65-69 0.001318\n",
|
|||
|
"46 Asthma_Yes 0.001315\n",
|
|||
|
"45 Asthma_No 0.001256\n",
|
|||
|
"23 AgeCategory_60-64 0.001091\n",
|
|||
|
"30 Race_Black 0.000885\n",
|
|||
|
"22 AgeCategory_55-59 0.000853\n",
|
|||
|
"31 Race_Hispanic 0.000825\n",
|
|||
|
"21 AgeCategory_50-54 0.000715\n",
|
|||
|
"32 Race_Other 0.000699\n",
|
|||
|
"7 AlcoholDrinking_No 0.000560\n",
|
|||
|
"8 AlcoholDrinking_Yes 0.000550\n",
|
|||
|
"20 AgeCategory_45-49 0.000520\n",
|
|||
|
"28 Race_American Indian/Alaskan Native 0.000503\n",
|
|||
|
"35 Diabetic_No, borderline diabetes 0.000479\n",
|
|||
|
"19 AgeCategory_40-44 0.000444\n",
|
|||
|
"18 AgeCategory_35-39 0.000412\n",
|
|||
|
"17 AgeCategory_30-34 0.000267\n",
|
|||
|
"29 Race_Asian 0.000260\n",
|
|||
|
"15 AgeCategory_18-24 0.000231\n",
|
|||
|
"16 AgeCategory_25-29 0.000217\n",
|
|||
|
"37 Diabetic_Yes (during pregnancy) 0.000184\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Определение категориальных признаков\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
" 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
|
|||
|
" 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к обучающей выборке\n",
|
|||
|
"train_df_encoded = pd.get_dummies(train_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к контрольной выборке\n",
|
|||
|
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к тестовой выборке\n",
|
|||
|
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='heart_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='heart', dataframe=train_df_encoded, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='heart', max_depth=2)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df_encoded.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df_encoded.index)\n",
|
|||
|
"\n",
|
|||
|
"# Оценка важности признаков\n",
|
|||
|
"X = feature_matrix\n",
|
|||
|
"y = train_df_encoded['HeartDisease']\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Получение важности признаков\n",
|
|||
|
"importances = model.feature_importances_\n",
|
|||
|
"feature_names = feature_matrix.columns\n",
|
|||
|
"\n",
|
|||
|
"# Сортировка признаков по важности\n",
|
|||
|
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
|
|||
|
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Feature Importance:\")\n",
|
|||
|
"print(feature_importance)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 156699\n",
|
|||
|
"Размер контрольной выборки: 67157\n",
|
|||
|
"Размер тестовой выборки: 95939\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n",
|
|||
|
"c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
|
|||
|
" df = pd.concat([df, default_df], sort=True)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Accuracy: 0.9977068603095739\n",
|
|||
|
"Precision: 0.9982521847690387\n",
|
|||
|
"Recall: 0.9753598438643571\n",
|
|||
|
"F1 Score: 0.9866732477788747\n",
|
|||
|
"ROC AUC: 0.9875985227973351\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"ename": "KeyboardInterrupt",
|
|||
|
"evalue": "",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
|||
|
"Cell \u001b[1;32mIn[31], line 84\u001b[0m\n\u001b[0;32m 81\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mROC AUC: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mroc_auc\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 83\u001b[0m \u001b[38;5;66;03m# Кросс-валидация\u001b[39;00m\n\u001b[1;32m---> 84\u001b[0m scores \u001b[38;5;241m=\u001b[39m \u001b[43mcross_val_score\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscoring\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43maccuracy\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 85\u001b[0m accuracy_cv \u001b[38;5;241m=\u001b[39m scores\u001b[38;5;241m.\u001b[39mmean()\n\u001b[0;32m 86\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCross-validated Accuracy: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00maccuracy_cv\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:712\u001b[0m, in \u001b[0;36mcross_val_score\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, error_score)\u001b[0m\n\u001b[0;32m 709\u001b[0m \u001b[38;5;66;03m# To ensure multimetric format is not supported\u001b[39;00m\n\u001b[0;32m 710\u001b[0m scorer \u001b[38;5;241m=\u001b[39m check_scoring(estimator, scoring\u001b[38;5;241m=\u001b[39mscoring)\n\u001b[1;32m--> 712\u001b[0m cv_results \u001b[38;5;241m=\u001b[39m \u001b[43mcross_validate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 713\u001b[0m \u001b[43m \u001b[49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 714\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 715\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 716\u001b[0m \u001b[43m \u001b[49m\u001b[43mgroups\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroups\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 717\u001b[0m \u001b[43m \u001b[49m\u001b[43mscoring\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mscore\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mscorer\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 718\u001b[0m \u001b[43m \u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 719\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 720\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 721\u001b[0m \u001b[43m \u001b[49m\u001b[43mfit_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 722\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 723\u001b[0m \u001b[43m \u001b[49m\u001b[43mpre_dispatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpre_dispatch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 724\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_score\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merror_score\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 725\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 726\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m cv_results[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:423\u001b[0m, in \u001b[0;36mcross_validate\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)\u001b[0m\n\u001b[0;32m 420\u001b[0m \u001b[38;5;66;03m# We clone the estimator to make sure that all the folds are\u001b[39;00m\n\u001b[0;32m 421\u001b[0m \u001b[38;5;66;03m# independent, and that it is pickle-able.\u001b[39;00m\n\u001b[0;32m 422\u001b[0m parallel \u001b[38;5;241m=\u001b[39m Parallel(n_jobs\u001b[38;5;241m=\u001b[39mn_jobs, verbose\u001b[38;5;241m=\u001b[39mverbose, pre_dispatch\u001b[38;5;241m=\u001b[39mpre_dispatch)\n\u001b[1;32m--> 423\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mparallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 424\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_fit_and_score\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 425\u001b[0m \u001b[43m \u001b[49m\u001b[43mclone\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 426\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 427\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 428\u001b[0m \u001b[43m \u001b[49m\u001b[43mscorer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mscorers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 429\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 430\u001b[0m \u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 431\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mfit_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mscore_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscorer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscore\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 435\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_train_score\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_train_score\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 436\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_times\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 437\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_estimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_estimator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 438\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_score\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merror_score\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 439\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 440\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mindices\u001b[49m\n\u001b[0;32m 441\u001b[0m \u001b[43m\u001
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1918\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1916\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m 1917\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1920\u001b[0m \u001b[38;5;66;03m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m 1921\u001b[0m \u001b[38;5;66;03m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m 1922\u001b[0m \u001b[38;5;66;03m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m 1923\u001b[0m \u001b[38;5;66;03m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m 1924\u001b[0m \u001b[38;5;66;03m# callback.\u001b[39;00m\n\u001b[0;32m 1925\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1847\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1845\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_batches \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1847\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1848\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_completed_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1849\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_progress()\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:136\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 134\u001b[0m config \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mconfig):\n\u001b[1;32m--> 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:888\u001b[0m, in \u001b[0;36m_fit_and_score\u001b[1;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, score_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)\u001b[0m\n\u001b[0;32m 886\u001b[0m estimator\u001b[38;5;241m.\u001b[39mfit(X_train, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\n\u001b[0;32m 887\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 888\u001b[0m \u001b[43mestimator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 890\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[0;32m 891\u001b[0m \u001b[38;5;66;03m# Note fit time as time until error\u001b[39;00m\n\u001b[0;32m 892\u001b[0m fit_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start_time\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:489\u001b[0m, in \u001b[0;36mBaseForest.fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 478\u001b[0m trees \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 479\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_estimator(append\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[0;32m 480\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(n_more_estimators)\n\u001b[0;32m 481\u001b[0m ]\n\u001b[0;32m 483\u001b[0m \u001b[38;5;66;03m# Parallel loop: we prefer the threading backend as the Cython code\u001b[39;00m\n\u001b[0;32m 484\u001b[0m \u001b[38;5;66;03m# for fitting the trees is internally releasing the Python GIL\u001b[39;00m\n\u001b[0;32m 485\u001b[0m \u001b[38;5;66;03m# making threading more efficient than multiprocessing in\u001b[39;00m\n\u001b[0;32m 486\u001b[0m \u001b[38;5;66;03m# that case. 
However, for joblib 0.12+ we respect any\u001b[39;00m\n\u001b[0;32m 487\u001b[0m \u001b[38;5;66;03m# parallel_backend contexts set at a higher level,\u001b[39;00m\n\u001b[0;32m 488\u001b[0m \u001b[38;5;66;03m# since correctness does not rely on using threads.\u001b[39;00m\n\u001b[1;32m--> 489\u001b[0m trees \u001b[38;5;241m=\u001b[39m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 490\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 491\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 492\u001b[0m \u001b[43m \u001b[49m\u001b[43mprefer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mthreads\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 493\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 494\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_parallel_build_trees\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 495\u001b[0m \u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbootstrap\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 
501\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtrees\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 502\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 503\u001b[0m \u001b[43m \u001b[49m\u001b[43mclass_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclass_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 504\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_samples_bootstrap\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_samples_bootstrap\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1918\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1916\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m 1917\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1920\u001b[0m \u001b[38;5;66;03m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m 1921\u001b[0m \u001b[38;5;66;03m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m 1922\u001b[0m \u001b[38;5;66;03m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m 1923\u001b[0m \u001b[38;5;66;03m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m 1924\u001b[0m \u001b[38;5;66;03m# callback.\u001b[39;00m\n\u001b[0;32m 1925\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1847\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1845\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_batches \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1847\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1848\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_completed_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1849\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_progress()\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:136\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 134\u001b[0m config \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mconfig):\n\u001b[1;32m--> 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:192\u001b[0m, in \u001b[0;36m_parallel_build_trees\u001b[1;34m(tree, bootstrap, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight, n_samples_bootstrap, missing_values_in_feature_mask)\u001b[0m\n\u001b[0;32m 189\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m class_weight \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbalanced_subsample\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 190\u001b[0m curr_sample_weight \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m=\u001b[39m compute_sample_weight(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbalanced\u001b[39m\u001b[38;5;124m\"\u001b[39m, y, indices\u001b[38;5;241m=\u001b[39mindices)\n\u001b[1;32m--> 192\u001b[0m \u001b[43mtree\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 193\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 194\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 195\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurr_sample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 196\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_input\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 197\u001b[0m \u001b[43m \u001b[49m\u001b[43mmissing_values_in_feature_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmissing_values_in_feature_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 199\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 200\u001b[0m tree\u001b[38;5;241m.\u001b[39m_fit(\n\u001b[0;32m 201\u001b[0m X,\n\u001b[0;32m 202\u001b[0m y,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m 
missing_values_in_feature_mask\u001b[38;5;241m=\u001b[39mmissing_values_in_feature_mask,\n\u001b[0;32m 206\u001b[0m )\n",
|
|||
|
"File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\tree\\_classes.py:472\u001b[0m, in \u001b[0;36mBaseDecisionTree._fit\u001b[1;34m(self, X, y, sample_weight, check_input, missing_values_in_feature_mask)\u001b[0m\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 462\u001b[0m builder \u001b[38;5;241m=\u001b[39m BestFirstTreeBuilder(\n\u001b[0;32m 463\u001b[0m splitter,\n\u001b[0;32m 464\u001b[0m min_samples_split,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 469\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmin_impurity_decrease,\n\u001b[0;32m 470\u001b[0m )\n\u001b[1;32m--> 472\u001b[0m \u001b[43mbuilder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbuild\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtree_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmissing_values_in_feature_mask\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 474\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_outputs_ \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m is_classifier(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 475\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_classes_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_classes_[\u001b[38;5;241m0\u001b[39m]\n",
|
|||
|
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Определение категориальных признаков\n",
|
|||
|
"categorical_features = [\n",
|
|||
|
" 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
|
|||
|
" 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к обучающей выборке\n",
|
|||
|
"train_df_encoded = pd.get_dummies(train_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к контрольной выборке\n",
|
|||
|
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к тестовой выборке\n",
|
|||
|
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)\n",
|
|||
|
"\n",
|
|||
|
"# Определение сущностей\n",
|
|||
|
"es = ft.EntitySet(id='heart_data')\n",
|
|||
|
"es = es.add_dataframe(dataframe_name='heart', dataframe=train_df_encoded, index='id')\n",
|
|||
|
"\n",
|
|||
|
"# Генерация признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='heart', max_depth=2)\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование признаков для контрольной и тестовой выборок\n",
|
|||
|
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df_encoded.index)\n",
|
|||
|
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df_encoded.index)\n",
|
|||
|
"\n",
|
|||
|
"# Удаление строк с NaN\n",
|
|||
|
"feature_matrix = feature_matrix.dropna()\n",
|
|||
|
"val_feature_matrix = val_feature_matrix.dropna()\n",
|
|||
|
"test_feature_matrix = test_feature_matrix.dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"X_train = feature_matrix.drop('HeartDisease', axis=1)\n",
|
|||
|
"y_train = feature_matrix['HeartDisease']\n",
|
|||
|
"X_val = val_feature_matrix.drop('HeartDisease', axis=1)\n",
|
|||
|
"y_val = val_feature_matrix['HeartDisease']\n",
|
|||
|
"X_test = test_feature_matrix.drop('HeartDisease', axis=1)\n",
|
|||
|
"y_test = test_feature_matrix['HeartDisease']\n",
|
|||
|
"\n",
|
|||
|
"# Выбор модели\n",
|
|||
|
"model = RandomForestClassifier(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Предсказание и оценка\n",
|
|||
|
"y_pred = model.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|||
|
"precision = precision_score(y_test, y_pred)\n",
|
|||
|
"recall = recall_score(y_test, y_pred)\n",
|
|||
|
"f1 = f1_score(y_test, y_pred)\n",
|
|||
|
"roc_auc = roc_auc_score(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Accuracy: {accuracy}\")\n",
|
|||
|
"print(f\"Precision: {precision}\")\n",
|
|||
|
"print(f\"Recall: {recall}\")\n",
|
|||
|
"print(f\"F1 Score: {f1}\")\n",
|
|||
|
"print(f\"ROC AUC: {roc_auc}\")\n",
|
|||
|
"\n",
|
|||
|
"# Кросс-валидация\n",
|
|||
|
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')\n",
|
|||
|
"accuracy_cv = scores.mean()\n",
|
|||
|
"print(f\"Cross-validated Accuracy: {accuracy_cv}\")\n",
|
|||
|
"\n",
|
|||
|
"# Анализ важности признаков\n",
|
|||
|
"feature_importances = model.feature_importances_\n",
|
|||
|
"feature_names = X_train.columns\n",
|
|||
|
"\n",
|
|||
|
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
|
|||
|
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.barplot(x='Importance', y='Feature', data=importance_df)\n",
|
|||
|
"plt.title('Feature Importance')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка на переобучение\n",
|
|||
|
"y_train_pred = model.predict(X_train)\n",
|
|||
|
"\n",
|
|||
|
"accuracy_train = accuracy_score(y_train, y_train_pred)\n",
|
|||
|
"precision_train = precision_score(y_train, y_train_pred)\n",
|
|||
|
"recall_train = recall_score(y_train, y_train_pred)\n",
|
|||
|
"f1_train = f1_score(y_train, y_train_pred)\n",
|
|||
|
"roc_auc_train = roc_auc_score(y_train, y_train_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Train Accuracy: {accuracy_train}\")\n",
|
|||
|
"print(f\"Train Precision: {precision_train}\")\n",
|
|||
|
"print(f\"Train Recall: {recall_train}\")\n",
|
|||
|
"print(f\"Train F1 Score: {f1_train}\")\n",
|
|||
|
"print(f\"Train ROC AUC: {roc_auc_train}\")\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
|
|||
|
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
|
|||
|
"plt.xlabel('Actual HeartDisease')\n",
|
|||
|
"plt.ylabel('Predicted HeartDisease')\n",
|
|||
|
"plt.title('Actual vs Predicted HeartDisease')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|