{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Набор данных для анализа и прогнозирования сердечного приступа" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Вывод всех столбцов" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',\n", " 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',\n", " 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',\n", " 'Asthma', 'KidneyDisease', 'SkinCancer'],\n", " dtype='object')\n" ] } ], "source": [ "import pandas as pd \n", "df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n", "print(df.columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Бизнес-цели: \n", "1. Разработка персонализированных программ профилактики сердечно-сосудистых заболеваний.\n", "Цель технического проекта: создание модели машинного обучения, которая будет прогнозировать риск сердечного приступа для каждого пациента на основе его индивидуальных факторов риска, и разработка онлайн-платформы или приложения для предоставления персонализированных рекомендаций по профилактике.\n", "2. Улучшение качества медицинской помощи.\n", "Цель технического проекта: использование данных для выявления групп населения с наибольшим риском сердечного приступа и разработки целевых программ профилактики и раннего выявления заболеваний." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Выполним разбиение на 3 выборки: обучающую, контрольную и тестовую" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Размер обучающей выборки: 156699\n", "Размер контрольной выборки: 67157\n", "Размер тестовой выборки: 95939\n" ] } ], "source": [
 "import pandas as pd\n",
 "from sklearn.model_selection import train_test_split\n",
 "\n",
 "df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
 "\n",
 "# Both splits use the same hold-out fraction and seed.\n",
 "split_options = {\"test_size\": 0.3, \"random_state\": 42}\n",
 "\n",
 "# First split: hold out 30% of all rows as the test set.\n",
 "train_df, test_df = train_test_split(df, **split_options)\n",
 "\n",
 "# Second split: divide the remaining 70% again 70/30 into training and validation,\n",
 "# i.e. roughly 49% / 21% / 30% of the full dataset.\n",
 "train_df, val_df = train_test_split(train_df, **split_options)\n",
 "\n",
 "# Report the size of each split, one caption per line.\n",
 "split_sizes = {\n",
 "    \"Размер обучающей выборки:\": len(train_df),\n",
 "    \"Размер контрольной выборки:\": len(val_df),\n",
 "    \"Размер тестовой выборки:\": len(test_df),\n",
 "}\n",
 "for caption, n_rows in split_sizes.items():\n",
 "    print(caption, n_rows)"
 ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Распределение классов в HeartDisease:\n", "HeartDisease\n", "No 292422\n", "Yes 27373\n", "Name: count, dtype: int64\n" ] }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAlUAAAHHCAYAAACWQK1nAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABJlUlEQVR4nO3deVgVdf//8dcBZVE8oCIgiUuaO2niEpm4RKJS3ablkne5ZnWjpZSad4b7TeXtlku2fBMrLbVSSxM13JU0KdxSM8NbS0FcACUFhfn90cX8PIKKOoro83Fd57o8M+/5zHuGwzkvZ+YMNsMwDAEAAOCGOBV1AwAAAHcCQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAVxETEyObzaaDBw8WdSu4jRGqUCzkvaHlPdzc3FSzZk0NGDBAKSkpRd0ecFcaNWqUbDabjh8/XuD8qlWr6rHHHrvFXf1/M2fOVExMTL7pa9eudXg/cXV1la+vr1q1aqX//Oc/Sk1NvfXN4o5QoqgbAK7FmDFjVK1aNZ07d04bN27Ue++9p++++067du1SqVKliro9ALeRmTNnytvbW7169Spw/ssvv6wmTZooJydHqamp2rx5s0aOHKlJkyZpwYIFatOmjVn77LPPqlu3bnJ1db1F3aM4IlShWGnfvr0aN24sSerXr5/Kly+vSZMmacmSJerevXsRdwfgdvDXX38V6j9ZLVq00FNPPeUwbfv27Wrbtq06d+6sX375RRUrVpQkOTs7y9nZ+ab0izsHp/9QrOX9TzIpKUmSdPLkSb322msKDAyUh4eH7Ha72rdvr+3bt+db9ty5cxo1apRq1qwpNzc3VaxYUZ06ddKBAwckSQcPHnQ4RXDpo1WrVuZYeacT5s+fr3//+9/y8/NT6dKl9cQTT+jw4cP51r1lyxa1a9dOnp6eKlWqlFq2bKlNmzYVuI2tWrUqcP2jRo3KV/vZZ58pKChI7u7uKleunLp161bg+q+0bRfLzc3VlClTVK9ePbm5ucnX11cvvPCCTp065VB3udM8AwYMyDdmQb1PmDAh3z6VpKysLI0cOVI1atSQq6urAgICNHToUGVlZRW4ry7WqlWrfOONHz9eTk5Omjdv3nXtj//+97966KGHVL58ebm7uysoKEhffvllgev/7LPP1LRpU5UqVUply5ZVSEiIVq5c6VCzfPlytWzZUmXKlJHdbleTJk3y9bZw4ULzZ+rt7a1//vOf+vPPPx1qevXq5dBz2bJl1apVK23YsOGq++lGlr0ehX1NLVmyROHh4fL395erq6uqV6+usWPHKicnx6GuVatWql+/vhISEhQSEqJSpUrp3//+t6pWrardu3dr3bp1Bf7OXk6DBg00ZcoUpaWlafr06eb0gq6p2rZtm8LCwuTt7S13d3dVq1ZNffr0uanbu3//fnXu3Fl+fn5yc3NTpUqV1K1bN6WnpzvUFfa9ANbiSBWKtbwAVL58eUnS77//rsWLF+vpp59WtWrVlJKSovfff18tW7bUL7/8In9/f0lSTk6OHnvsMcXFxalbt2565ZVXdPr0aa1atUq7du1S9erVzXV0795dHTp0cFjv8OHDC+xn/PjxstlsGjZsmI4dO6YpU6YoNDRUiYmJcnd3lyStXr1a7du3V1BQkEaOHCknJyfNnj1bbdq00YYNG9S0adN841aqVEnR0dGSpDNnzuill14qcN1vvvmmunTpon79+ik1NVXTpk1TSEiIfv75Z3l5eeVbpn///mrRooUk6euvv9aiRYsc5r/wwguKiYlR79699fLLLyspKUnTp0/Xzz//rE2bNqlkyZIF7odrkZaWZm7bxXJzc/XEE09o48aN6t+/v+rUqaOdO3dq8uTJ+vXXX7V48eJrWs/s2bM1YsQITZw4Uc8880yBNVfbH1OnTtUTTzyhHj16KDs7W1988YWefvppLV26VOHh4Wbd6NG
jNWrUKD300EMaM2aMXFxctGXLFq1evVpt27aV9PeHdJ8+fVSvXj0NHz5cXl5e+vnnnxUbG2v2l7fvmzRpoujoaKWkpGjq1KnatGlTvp+pt7e3Jk+eLEn6448/NHXqVHXo0EGHDx8u8Gd/sRtZVvr7PzMFyc3NzTetsK+pmJgYeXh4KDIyUh4eHlq9erWioqKUkZGhCRMmOIx54sQJtW/fXt26ddM///lP8/qogQMHysPDQ2+88YYkydfX96rbIklPPfWU+vbtq5UrV2r8+PEF1hw7dkxt27ZVhQoV9Prrr8vLy0sHDx7U119/fdO2Nzs7W2FhYcrKytLAgQPl5+enP//8U0uXLlVaWpo8PT0lXd97ASxiAMXA7NmzDUnG999/b6SmphqHDx82vvjiC6N8+fKGu7u78ccffxiGYRjnzp0zcnJyHJZNSkoyXF1djTFjxpjTPv74Y0OSMWnSpHzrys3NNZeTZEyYMCFfTb169YyWLVuaz9esWWNIMu655x4jIyPDnL5gwQJDkjF16lRz7Pvuu88ICwsz12MYhvHXX38Z1apVMx599NF863rooYeM+vXrm89TU1MNScbIkSPNaQcPHjScnZ2N8ePHOyy7c+dOo0SJEvmm79+/35BkzJkzx5w2cuRI4+K3hA0bNhiSjLlz5zosGxsbm296lSpVjPDw8Hy9R0REGJe+zVza+9ChQw0fHx8jKCjIYZ9++umnhpOTk7FhwwaH5WfNmmVIMjZt2pRvfRdr2bKlOd6yZcuMEiVKGK+++mqBtYXZH4bx98/pYtnZ2Ub9+vWNNm3aOIzl5ORkPPnkk/lei3k/87S0NKNMmTJGs2bNjLNnzxZYk52dbfj4+Bj169d3qFm6dKkhyYiKijKn9ezZ06hSpYrDOB988IEhydi6dWuB22zFsnn76EqPi18X1/KaunRfG4ZhvPDCC0apUqWMc+fOmdNatmxpSDJmzZqVr/7S39M8eb+vCxcuvOy2NWjQwChbtqz5PO89KCkpyTAMw1i0aJEhyfjxxx8vO4bV2/vzzz9fte9rfS+AtTj9h2IlNDRUFSpUUEBAgLp16yYPDw8tWrRI99xzjyTJ1dVVTk5/v6xzcnJ04sQJeXh4qFatWvrpp5/Mcb766it5e3tr4MCB+dZx6Smfa/Hcc8+pTJky5vOnnnpKFStW1HfffSdJSkxM1P79+/XMM8/oxIkTOn78uI4fP67MzEw98sgjWr9+fb7/3Z87d05ubm5XXO/XX3+t3NxcdenSxRzz+PHj8vPz03333ac1a9Y41GdnZ0vSFS+6XbhwoTw9PfXoo486jBkUFCQPD498Y54/f96h7vjx4zp37twV+/7zzz81bdo0vfnmm/Lw8Mi3/jp16qh27doOY+ad8r10/ZezdetWdenSRZ07d853hCNPYfaHJPNooySdOnVK6enpatGihcNra/HixcrNzVVUVJT5WsyT99patWqVTp8+rddffz3fzzavZtu2bTp27Jj+9a9/OdSEh4erdu3aWrZsmcNyubm55j5KTEzUJ598oooVK6pOnTpX3KYbXVb6+/dp1apV+R6XHhm6ltfUxfv69OnTOn78uFq0aKG//vpLe/fudRjX1dVVvXv3LlSvheXh4aHTp09fdn7e0Z6lS5fq/PnzBdZYvb15R6JWrFihv/76q8B1Xut7AazF6T8UKzNmzFDNmjVVokQJ+fr6qlatWg4fXLm5uZo6dapmzpyppKQkh+sR8k4RSn+fNqxVq5ZKlLD2V+C+++5zeG6z2VSjRg3zOoz9+/dLknr27HnZMdLT01W2bFnz+fHjx/ONe6n9+/fLMIzL1l16mi4tLU2S8gWZS8dMT0+Xj49PgfOPHTvm8HzlypWqUKHCFfu81MiRI+Xv768XXngh37VJ+/fv1549ey475qXrL8iff/6p8PBwZWZm6sSJE5cNzIXZH9LfH6Djxo1TYmKiw3VdF4974MABOTk5qW7dupcdJ++0df369S9b87/
//U+SVKtWrXzzateurY0bNzpMO3z4sMO+qlixor766qurbtONLitJISEh8vb2zjf90sB4La+p3bt3a8SIEVq9erUyMjIc6i69fuiee+6Ri4tLoXotrDNnzjj8B+lSLVu2VOfOnTV69GhNnjxZrVq1UseOHfXMM8+Y4dzq7a1WrZoiIyM1adIkzZ07Vy1atNATTzyhf/7zn2bgutb3AliLUIVipWnTpua3/wryn//8R2+++ab69OmjsWPHqly5cnJyctKgQYMKvL7jVsvrYcKECWrYsGGBNRd/kGVnZ+vo0aN69NFHrzquzWbT8uXLC/yG0qUfjsnJyZIkPz+/K47p4+OjuXPnFjj/0rDTrFkzjRs3zmHa9OnTtWTJkgKX37Nnj2JiYvTZZ58V+Eafm5urwMBATZo0qcDlAwICLtt7nt9++02NGjXS5MmT9eyzz2rOnDkFBtrC7I8NGzboiSeeUEhIiGbOnKmKFSuqZMmSmj17dr6Ly4uCr6+vPvvsM0l/fwh//PHHateunTZu3KjAwMCbtuy1KOxrKi0tTS1btpTdbteYMWNUvXp1ubm56aefftKwYcPy/S5ffJTHCufPn9evv/56xdBrs9n05Zdf6ocfftC3336rFStWqE+fPpo4caJ++OEHeXh43JTtnThxonr16qUlS5Zo5cqVevnllxUdHa0ffvhBlSpVuub3AliLUIU7ypdffqnWrVvr//7v/xymp6WlOfxPunr16tqyZYvOnz9v6f/c8o5E5TEMQ7/99pvuv/9+c72SZLfbFRoaetXxtm/frvPnz18xSOaNaxiGqlWrppo1a1513F9++UU2m63AoyAXj/n999+refPmhfrQ8vb2zrdNV7qYfPjw4WrYsKG6du162fVv375djzzyyHWfks079err66slS5bo1VdfVYcOHfIFwsLsj6+++kpubm5asWKFw2nC2bNn5+s7NzdXv/zyy2WDc97rYNeuXapRo0aBNVWqVJEk7du3z+F+SXnT8ubncXNzc9j/TzzxhMqVK6fp06fr/fffv+x23eiy16Kwr6m1a9fqxIkT+vrrrxUSEmJOz/uWb2Fd7+vmyy+/1NmzZxUWFnbV2gcffFAPPvigxo8fr3nz5qlHjx764osv1K9fv5u2vYGBgQoMDNSIESO0efNmNW/eXLNmzdK4ceOu+b0A1uKaKtxRnJ2dZRiGw7SFCxfm+wp6586ddfz4cYevTOe5dPlr8cknnzhch/Hll1/q6NGjat++vSQpKChI1atX13//+1+dOXMm3/KX3sl54cKFcnZ2vupdqTt16iRnZ2eNHj06X/+GYejEiRPm8wsXLuirr75S06ZNr/i/1i5duignJ0djx47NN+/ChQvmKbPrER8fryVLluitt9667Adfly5d9Oeff+rDDz/MN+/s2bPKzMy86npq1qxpXtczbdo05ebm6pVXXnGoKez+cHZ2ls1mczilfPDgwXzBsWPHjnJyctKYMWPyHVHJ+9m0bdtWZcqUUXR0dL7rzvJqGjduLB8fH82aNcvhVOPy5cu1Z88eh28bFiQ7O1sXLlwo1O0nrFz2Sgr7mso7wnLxazk7O1szZ868pvWVLl36ml+n27dv16BBg1S2bFlFRERctu7UqVP5ftfyQnTefrN6ezMyMnThwgWHaYGBgXJycjLXeS3vBbAeR6pwR3nsscc0ZswY9e7dWw899JB27typuXPn6t5773Woe+655/TJJ58oMjJSW7duVYsWLZSZmanvv/9e//rXv/SPf/zjutZfrlw5Pfzww+rdu7dSUlI0ZcoU1ahRQ88//7wkycnJSR999JHat2+vevXqqXfv3rrnnnv0559/as2aNbLb7fr222+VmZmpGTNm6N1331XNmjW1du1acx15YWzHjh2Kj49XcHCwqlevrnHjxmn48OE6ePCgOnbsqDJlyigpKUmLFi1S//799dprr+n777/Xm2++qR07dujbb7+94ra0bNlSL7zwgqKjo5W
YmKi2bduqZMmS2r9/vxYuXKipU6fmu3FiYa1cuVKPPvroFY/WPfvss1qwYIFefPFFrVmzRs2bN1dOTo727t2rBQsWaMWKFVc9gncxPz8/TZgwQf369dM///lPdejQ4Zr2R3h4uCZNmqR27drpmWee0bFjxzRjxgzVqFFDO3bsMOtq1KihN954Q2PHjlWLFi3UqVMnubq66scff5S/v7+io6Nlt9s1efJk9evXT02aNNEzzzyjsmXLavv27frrr780Z84clSxZUm+//bZ69+6tli1bqnv37uYtFapWrarBgwc79JeZmelwCu/TTz/VuXPn9OSTT15139zIsteisK+phx56SGXLllXPnj318ssvy2az6dNPP73m//AEBQXpvffe07hx41SjRg35+Pg4HPXbsGGDzp07Z36pZdOmTfrmm2/k6empRYsWXfF08Jw5czRz5kw9+eSTql69uk6fPq0PP/xQdrvdvAWL1du7evVqDRgwQE8//bRq1qypCxcu6NNPP5Wzs7M6d+4sSYV+L8BNcqu/bghcj7yvM1/p68uG8fctFV599VWjYsWKhru7u9G8eXMjPj7e4ev1ef766y/jjTfeMKpVq2aULFnS8PPzM5566injwIEDhmFc3y0VPv/8c2P48OGGj4+P4e7uboSHhxv/+9//8i3/888/G506dTLKly9vuLq6GlWqVDG6dOlixMXFOaz7ao+ePXs6jPvVV18ZDz/8sFG6dGmjdOnSRu3atY2IiAhj3759hmEYxsCBA42QkBAjNjY2X08F3ULAMP7+en1QUJDh7u5ulClTxggMDDSGDh1qHDlyxKy51lsq2Gw2IyEhwWF6QT+j7Oxs4+233zbq1atnuLq6GmXLljWCgoKM0aNHG+np6fnWd7XxDMMw2rRpY1SuXNk4ffr0Ne+P//u//zPuu+8+w9XV1ahdu7Yxe/bsy+63jz/+2HjggQfMvlu2bGmsWrXKoeabb74xHnroIcPd3d2w2+1G06ZNjc8//9yhZv78+eY45cqVM3r06GHeQiRPz549HV4XHh4eRqNGjYxPP/30ivvoRpfN2/bU1NQC51/udVGY19SmTZuMBx980HB3dzf8/f2NoUOHGitWrDAkGWvWrDHrWrZsadSrV6/A9ScnJxvh4eFGmTJlDEnm6yHv9zXvUbJkSaNChQpGSEiIMX78eOPYsWP5xrr0lgo//fST0b17d6Ny5cqGq6ur4ePjYzz22GPGtm3bbtr2/v7770afPn2M6tWrG25ubka5cuWM1q1bG99//32+dV7tvQA3h80wbuBcBwBJf18T0bp1ay1cuPC6j95c7ODBg6pWrZqSkpJUtWrVAmtGjRqlgwcPFvgHYwEAtx7XVAEAAFiAa6qA25CHh4d69OhxxQun77//fvPP7gAAih6hCrgNeXt7mxcOX06nTp1uUTcAgMLgmioAAAALcE0VAACABQhVAAAAFuCaqlsoNzdXR44cUZkyZa77zycAAIBbyzAMnT59Wv7+/nJyuvzxKELVLXTkyJFC/RFYAABw+zl8+LAqVap02fmEqluoTJkykv7+odjt9iLuBgAAFEZGRoYCAgLMz/HLIVTdQnmn/Ox2O6EKAIBi5mqX7nChOgAAgAWKNFS99957uv/++80jN8HBwVq+fLk5/9y5c4qIiFD58uXl4eGhzp07KyUlxWGMQ4cOKTw8XKVKlZKPj4+GDBmiCxcuONSsXbtWjRo1kqurq2rUqFHg30qbMWOGqlatKjc3NzVr1kxbt251mF+YXgAAwN2rSENVpUqV9NZbbykhIUHbtm1TmzZt9I9//EO7d++WJA0ePFjffvutFi5cqHXr1unIkSMOd5HOyclReHi4srOztXnzZs2ZM0cxMTGKiooya5KSkhQeHq7WrVsrMTFRgwYNUr9+/bRixQqzZv78+YqMjNTIkSP1008/qUGDBgoLC9OxY8fMmqv1AgAA7nLGbaZs2bLGRx99ZKSlpRklS5Y0Fi5caM7
bs2ePIcmIj483DMMwvvvuO8PJyclITk42a9577z3DbrcbWVlZhmEYxtChQ4169eo5rKNr165GWFiY+bxp06ZGRESE+TwnJ8fw9/c3oqOjDcMwCtVLYaSnpxuSjPT09EIvAwAAilZhP79vm2uqcnJy9MUXXygzM1PBwcFKSEjQ+fPnFRoaatbUrl1blStXVnx8vCQpPj5egYGB8vX1NWvCwsKUkZFhHu2Kj493GCOvJm+M7OxsJSQkONQ4OTkpNDTUrClMLwAA4O5W5N/+27lzp4KDg3Xu3Dl5eHho0aJFqlu3rhITE+Xi4iIvLy+Hel9fXyUnJ0uSkpOTHQJV3vy8eVeqycjI0NmzZ3Xq1Cnl5OQUWLN3715zjKv1UpCsrCxlZWWZzzMyMq6yNwAAQHFV5EeqatWqpcTERG3ZskUvvfSSevbsqV9++aWo27JEdHS0PD09zQc3/gQA4M5V5KHKxcVFNWrUUFBQkKKjo9WgQQNNnTpVfn5+ys7OVlpamkN9SkqK/Pz8JEl+fn75voGX9/xqNXa7Xe7u7vL29pazs3OBNRePcbVeCjJ8+HClp6ebj8OHDxdupwAAgGKnyEPVpXJzc5WVlaWgoCCVLFlScXFx5rx9+/bp0KFDCg4OliQFBwdr586dDt/SW7Vqlex2u+rWrWvWXDxGXk3eGC4uLgoKCnKoyc3NVVxcnFlTmF4K4urqat4ught+AgBwh7tFF84X6PXXXzfWrVtnJCUlGTt27DBef/11w2azGStXrjQMwzBefPFFo3Llysbq1auNbdu2GcHBwUZwcLC5/IULF4z69esbbdu2NRITE43Y2FijQoUKxvDhw82a33//3ShVqpQxZMgQY8+ePcaMGTMMZ2dnIzY21qz54osvDFdXVyMmJsb45ZdfjP79+xteXl4O3yq8Wi+Fwbf/AAAofgr7+V2koapPnz5GlSpVDBcXF6NChQrGI488YgYqwzCMs2fPGv/617+MsmXLGqVKlTKefPJJ4+jRow5jHDx40Gjfvr3h7u5ueHt7G6+++qpx/vx5h5o1a9YYDRs2NFxcXIx7773XmD17dr5epk2bZlSuXNlwcXExmjZtavzwww8O8wvTy9UQqgAAKH4K+/ltMwzDKNpjZXePjIwMeXp6Kj09nVOBAAAUE4X9/L7trqkCAAAojghVAAAAFiBUAQAAWKDI76gO6wUN+aSoWwBuOwkTnivqFgDc4ThSBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABggSINVdHR0WrSpInKlCkjHx8fdezYUfv27XOoadWqlWw2m8PjxRdfdKg5dOiQwsPDVapUKfn4+GjIkCG6cOGCQ83atWvVqFEjubq6qkaNGoqJicnXz4wZM1S1alW5ubmpWbNm2rp1q8P8c+fOKSIiQuXLl5eHh4c6d+6slJQUa3YGAAAo1oo0VK1bt04RERH64YcftGrVKp0/f15t27ZVZmamQ93zzz+vo0ePmo933nnHnJeTk6Pw8HBlZ2dr8+bNmjNnjmJiYhQVFWXWJCUlKTw8XK1bt1ZiYqIGDRqkfv36acWKFWbN/PnzFRkZqZEjR+qnn35SgwYNFBYWpmPHjpk1gwcP1rfffquFCxdq3bp1OnLkiDp16nQT9xAAACgubIZhGEXdRJ7U1FT5+Ph
o3bp1CgkJkfT3kaqGDRtqypQpBS6zfPlyPfbYYzpy5Ih8fX0lSbNmzdKwYcOUmpoqFxcXDRs2TMuWLdOuXbvM5bp166a0tDTFxsZKkpo1a6YmTZpo+vTpkqTc3FwFBARo4MCBev3115Wenq4KFSpo3rx5euqppyRJe/fuVZ06dRQfH68HH3zwqtuXkZEhT09Ppaeny263X/d+upqgIZ/ctLGB4iphwnNF3QKAYqqwn9+31TVV6enpkqRy5co5TJ87d668vb1Vv359DR8+XH/99Zc5Lz4+XoGBgWagkqSwsDBlZGRo9+7dZk1oaKjDmGFhYYqPj5ckZWdnKyEhwaHGyclJoaGhZk1CQoLOnz/vUFO7dm1VrlzZrAEAAHevEkXdQJ7c3FwNGjRIzZs3V/369c3pzzzzjKpUqSJ/f3/t2LFDw4YN0759+/T1119LkpKTkx0ClSTzeXJy8hVrMjIydPbsWZ06dUo5OTkF1uzdu9ccw8XFRV5eXvlq8tZzqaysLGVlZZnPMzIyCrs7AABAMXPbhKqIiAjt2rVLGzdudJjev39/89+BgYGqWLGiHnnkER04cEDVq1e/1W1ek+joaI0ePbqo2wAAALfAbXH6b8CAAVq6dKnWrFmjSpUqXbG2WbNmkqTffvtNkuTn55fvG3h5z/38/K5YY7fb5e7uLm9vbzk7OxdYc/EY2dnZSktLu2zNpYYPH6709HTzcfjw4StuGwAAKL6KNFQZhqEBAwZo0aJFWr16tapVq3bVZRITEyVJFStWlCQFBwdr586dDt/SW7Vqlex2u+rWrWvWxMXFOYyzatUqBQcHS5JcXFwUFBTkUJObm6u4uDizJigoSCVLlnSo2bdvnw4dOmTWXMrV1VV2u93hAQAA7kxFevovIiJC8+bN05IlS1SmTBnz2iRPT0+5u7vrwIEDmjdvnjp06KDy5ctrx44dGjx4sEJCQnT//fdLktq2bau6devq2Wef1TvvvKPk5GSNGDFCERERcnV1lSS9+OKLmj59uoYOHao+ffpo9erVWrBggZYtW2b2EhkZqZ49e6px48Zq2rSppkyZoszMTPXu3dvsqW/fvoqMjFS5cuVkt9s1cOBABQcHF+qbfwAA4M5WpKHqvffek/T3bRMuNnv2bPXq1UsuLi76/vvvzYATEBCgzp07a8SIEWats7Ozli5dqpdeeknBwcEqXbq0evbsqTFjxpg11apV07JlyzR48GBNnTpVlSpV0kcffaSwsDCzpmvXrkpNTVVUVJSSk5PVsGFDxcbGOly8PnnyZDk5Oalz587KyspSWFiYZs6ceZP2DgAAKE5uq/tU3em4TxVQdLhPFYDrVSzvUwUAAFBcEaoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsUKShKjo6Wk2aNFGZMmXk4+Ojjh07at++fQ41586dU0REhMqXLy8PDw917txZKSkpDjWHDh1SeHi4SpUqJR8fHw0ZMkQXLlxwqFm7dq0aNWokV1dX1ahRQzExMfn6mTFjhqpWrSo3Nzc1a9ZMW7duveZeAADA3alIQ9W6desUERGhH374QatWrdL58+fVtm1bZWZmmjWDBw/Wt99+q4ULF2rdunU6cuSIOnXqZM7Pycl
ReHi4srOztXnzZs2ZM0cxMTGKiooya5KSkhQeHq7WrVsrMTFRgwYNUr9+/bRixQqzZv78+YqMjNTIkSP1008/qUGDBgoLC9OxY8cK3QsAALh72QzDMIq6iTypqany8fHRunXrFBISovT0dFWoUEHz5s3TU089JUnau3ev6tSpo/j4eD344INavny5HnvsMR05ckS+vr6SpFmzZmnYsGFKTU2Vi4uLhg0bpmXLlmnXrl3murp166a0tDTFxsZKkpo1a6YmTZpo+vTpkqTc3FwFBARo4MCBev311wvVy9VkZGTI09NT6enpstvtlu67iwUN+eSmjQ0UVwkTnivqFgAUU4X9/L6trqlKT0+XJJUrV06SlJCQoPPnzys0NNSsqV27tipXrqz4+HhJUnx8vAIDA81AJUlhYWHKyMjQ7t27zZqLx8iryRsjOztbCQkJDjVOTk4KDQ01awrTy6WysrKUkZHh8AAAAHem2yZU5ebmatCgQWrevLnq168vSUpOTpaLi4u8vLwcan19fZWcnGzWXByo8ubnzbtSTUZGhs6ePavjx48rJyenwJqLx7haL5eKjo6Wp6en+QgICCjk3gAAAMXNbROqIiIitGvXLn3xxRdF3Yplhg8frvT0dPNx+PDhom4JAADcJCWKugFJGjBggJYuXar169erUqVK5nQ/Pz9lZ2crLS3N4QhRSkqK/Pz8zJpLv6WX9428i2su/ZZeSkqK7Ha73N3d5ezsLGdn5wJrLh7jar1cytXVVa6urtewJwAAQHFVpEeqDMPQgAEDtGjRIq1evVrVqlVzmB8UFKSSJUsqLi7OnLZv3z4dOnRIwcHBkqTg4GDt3LnT4Vt6q1atkt1uV926dc2ai8fIq8kbw8XFRUFBQQ41ubm5iouLM2sK0wsAALh7FemRqoiICM2bN09LlixRmTJlzGuTPD095e7uLk9PT/Xt21eRkZEqV66c7Ha7Bg4cqODgYPPbdm3btlXdunX17LPP6p133lFycrJGjBihiIgI8yjRiy++qOnTp2vo0KHq06ePVq9erQULFmjZsmVmL5GRkerZs6caN26spk2basqUKcrMzFTv3r3Nnq7WCwAAuHsVaah67733JEmtWrVymD579mz16tVLkjR58mQ5OTmpc+fOysrKUlhYmGbOnGnWOjs7a+nSpXrppZcUHBys0qVLq2fPnhozZoxZU61aNS1btkyDBw/W1KlTValSJX300UcKCwsza7p27arU1FRFRUUpOTlZDRs2VGxsrMPF61frBQAA3L1uq/tU3em4TxVQdLhPFYDrVSzvUwUAAFBcEaoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwALXFaratGmjtLS0fNMzMjLUpk2bG+0JAACg2LmuULV27VplZ2fnm37u3Dlt2LDhhpsCAAAobkpcS/GOHTvMf//yyy9KTk42n+fk5Cg2Nlb33HOPdd0BAAAUE9cUqho2bCibzSabzVbgaT53d3dNmzbNsuYAAACKi2sKVUlJSTIMQ/fee6+2bt2qChUqmPNcXFzk4+MjZ2dny5sEAAC43V1TqKpSpYokKTc396Y0AwAAUFxdU6i62P79+7VmzRodO3YsX8iKioq64cYAAACKk+sKVR9++KFeeukleXt7y8/PTzabzZxns9kIVQAA4K5zXaFq3LhxGj9+vIYNG2Z1PwAAAMXSdd2n6tSpU3r66aet7gUAAKDYuq5Q9fTTT2vlypVW9wIAAFBsXdfpvxo1aujNN9/UDz/8oMDAQJUsWdJh/ss
vv2xJcwAAAMXFdYWqDz74QB4eHlq3bp3WrVvnMM9msxGqAADAXee6QlVSUpLVfQAAABRr13VNFQAAABxd15GqPn36XHH+xx9/fF3NAAAAFFfXFapOnTrl8Pz8+fPatWuX0tLSCvxDywAAAHe66zr9t2jRIofH0qVL9fvvv6tr16568MEHCz3O+vXr9fjjj8vf3182m02LFy92mN+rVy/ZbDaHR7t27RxqTp48qR49eshut8vLy0t9+/bVmTNnHGp27NihFi1ayM3NTQEBAXrnnXfy9bJw4ULVrl1bbm5uCgwM1Hfffecw3zAMRUVFqWLFinJ3d1doaKj2799f6G0FAAB3NsuuqXJyclJkZKQmT55c6GUyMzPVoEEDzZgx47I17dq109GjR83H559/7jC/R48e2r17t1atWqWlS5dq/fr16t+/vzk/IyNDbdu2VZUqVZSQkKAJEyZo1KhR+uCDD8yazZs3q3v37urbt69+/vlndezYUR07dtSuXbvMmnfeeUfvvvuuZs2apS1btqh06dIKCwvTuXPnCr29AADgznXdf1C5IAcOHNCFCxcKXd++fXu1b9/+ijWurq7y8/MrcN6ePXsUGxurH3/8UY0bN5YkTZs2TR06dNB///tf+fv7a+7cucrOztbHH38sFxcX1atXT4mJiZo0aZIZvqZOnap27dppyJAhkqSxY8dq1apVmj59umbNmiXDMDRlyhSNGDFC//jHPyRJn3zyiXx9fbV48WJ169at0NsMAADuTNcVqiIjIx2eG4aho0ePatmyZerZs6cljeVZu3atfHx8VLZsWbVp00bjxo1T+fLlJUnx8fHy8vIyA5UkhYaGysnJSVu2bNGTTz6p+Ph4hYSEyMXFxawJCwvT22+/rVOnTqls2bKKj4/Pt01hYWHm6cikpCQlJycrNDTUnO/p6almzZopPj7+sqEqKytLWVlZ5vOMjIwb3h8AAOD2dF2h6ueff3Z47uTkpAoVKmjixIlX/WbgtWjXrp06deqkatWq6cCBA/r3v/+t9u3bKz4+Xs7OzkpOTpaPj4/DMiVKlFC5cuWUnJwsSUpOTla1atUcanx9fc15ZcuWVXJysjnt4pqLx7h4uYJqChIdHa3Ro0dfx5YDAIDi5rpC1Zo1a6zuo0AXHwEKDAzU/fffr+rVq2vt2rV65JFHbkkPN2L48OEOR8AyMjIUEBBQhB0BAICb5YYuVE9NTdXGjRu1ceNGpaamWtXTZd17773y9vbWb7/9Jkny8/PTsWPHHGouXLigkydPmtdh+fn5KSUlxaEm7/nVai6ef/FyBdUUxNXVVXa73eEBAADuTNcVqjIzM9WnTx9VrFhRISEhCgkJkb+/v/r27au//vrL6h5Nf/zxh06cOKGKFStKkoKDg5WWlqaEhASzZvXq1crNzVWzZs3MmvXr1+v8+fNmzapVq1SrVi2VLVvWrImLi3NY16pVqxQcHCxJqlatmvz8/BxqMjIytGXLFrMGAADc3a4rVEVGRmrdunX69ttvlZaWprS0NC1ZskTr1q3Tq6++Wuhxzpw5o8TERCUmJkr6+4LwxMREHTp0SGfOnNGQIUP0ww8/6ODBg4qLi9M//vEP1ahRQ2FhYZKkOnXqqF27dnr++ee1detWbdq0SQMGDFC3bt3k7+8vSXrmmWfk4uKivn37avfu3Zo/f76mTp3qcFrulVdeUWxsrCZOnKi9e/dq1KhR2rZtmwYMGCDp7z8SPWjQII0bN07ffPONdu7cqeeee07+/v7q2LHj9exCAABwh7EZhmFc60Le3t768ssv1apVK4fpa9asUZcuXQp9KnDt2rVq3bp1vuk9e/bUe++9p44dO+rnn39WWlqa/P391bZtW40dO9bhgvGTJ09qwIAB+vbbb+Xk5KTOnTvr3XfflYeHh1mzY8cORURE6Mcff5S3t7cGDhyoYcOGOaxz4cKFGjFihA4ePKj77rtP77zzjjp06GDONwxDI0eO1AcffKC0tDQ
9/PDDmjlzpmrWrFmobZX+Prrl6emp9PT0m3oqMGjIJzdtbKC4SpjwXFG3AKCYKuzn93WFqlKlSikhIUF16tRxmL579241bdpUmZmZ197xXYBQBRQdQhWA61XYz+/rOv0XHByskSNHOtxN/OzZsxo9ejTXGAEAgLvSdd1SYcqUKWrXrp0qVaqkBg0aSJK2b98uV1dXrVy50tIGAQAAioPrClWBgYHav3+/5s6dq71790qSunfvrh49esjd3d3SBgEAAIqD6wpV0dHR8vX11fPPP+8w/eOPP1Zqamq+i8ABAADudNd1TdX777+v2rVr55ter149zZo164abAgAAKG6uK1QlJyebN+C8WIUKFXT06NEbbgoAAKC4ua5QFRAQoE2bNuWbvmnTJvOmmwAAAHeT67qm6vnnn9egQYN0/vx5tWnTRpIUFxenoUOHXtMd1QEAAO4U1xWqhgwZohMnTuhf//qXsrOzJUlubm4aNmyYhg8fbmmDAAAAxcF1hSqbzaa3335bb775pvbs2SN3d3fdd999cnV1tbo/AACAYuG6QlUeDw8PNWnSxKpeAAAAiq3rulAdAAAAjghVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFijSULV+/Xo9/vjj8vf3l81m0+LFix3mG4ahqKgoVaxYUe7u7goNDdX+/fsdak6ePKkePXrIbrfLy8tLffv21ZkzZxxqduzYoRYtWsjNzU0BAQF655138vWycOFC1a5dW25ubgoMDNR33313zb0AAIC7V5GGqszMTDVo0EAzZswocP4777yjd999V7NmzdKWLVtUunRphYWF6dy5c2ZNjx49tHv3bq1atUpLly7V+vXr1b9/f3N+RkaG2rZtqypVqighIUETJkzQqFGj9MEHH5g1mzdvVvfu3dW3b1/9/PPP6tixozp27Khdu3ZdUy8AAODuZTMMwyjqJiTJZrNp0aJF6tixo6S/jwz5+/vr1Vdf1WuvvSZJSk9Pl6+vr2JiYtStWzft2bNHdevW1Y8//qjGjRtLkmJjY9WhQwf98ccf8vf313vvvac33nhDycnJcnFxkSS9/vrrWrx4sfbu3StJ6tq1qzIzM7V06VKznwcffFANGzbUrFmzCtVLYWRkZMjT01Pp6emy2+2W7LeCBA355KaNDRRXCROeK+oWABRThf38vm2vqUpKSlJycrJCQ0PNaZ6enmrWrJni4+MlSfHx8fLy8jIDlSSFhobKyclJW7ZsMWtCQkLMQCVJYWFh2rdvn06dOmXWXLyevJq89RSml4JkZWUpIyPD4QEAAO5Mt22oSk5OliT5+vo6TPf19TXnJScny8fHx2F+iRIlVK5cOYeagsa4eB2Xq7l4/tV6KUh0dLQ8PT3NR0BAwFW2GgAAFFe3bai6EwwfPlzp6enm4/Dhw0XdEgAAuElu21Dl5+cnSUpJSXGYnpKSYs7z8/PTsWPHHOZfuHBBJ0+edKgpaIyL13G5movnX62Xgri6usputzs8AADAnem2DVXVqlWTn5+f4uLizGkZGRnasmWLgoODJUnBwcFKS0tTQkKCWbN69Wrl5uaqWbNmZs369et1/vx5s2bVqlWqVauWypYta9ZcvJ68mrz1FKYXAABwdyvSUHXmzBklJiYqMTFR0t8XhCcmJurQoUOy2WwaNGiQxo0bp2+++UY7d+7Uc889J39/f/MbgnXq1FG7du30/PPPa+vWrdq0aZMGDBi
gbt26yd/fX5L0zDPPyMXFRX379tXu3bs1f/58TZ06VZGRkWYfr7zyimJjYzVx4kTt3btXo0aN0rZt2zRgwABJKlQvAADg7laiKFe+bds2tW7d2nyeF3R69uypmJgYDR06VJmZmerfv7/S0tL08MMPKzY2Vm5ubuYyc+fO1YABA/TII4/IyclJnTt31rvvvmvO9/T01MqVKxUREaGgoCB5e3srKirK4V5WDz30kObNm6cRI0bo3//+t+677z4tXrxY9evXN2sK0wsAALh73Tb3qbobcJ8qoOhwnyoA16vY36cKAACgOCFUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFiBUAQAAWIBQBQAAYAFCFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFAABgAUIVAACABQhVAAAAFritQ9WoUaNks9kcHrVr1zbnnzt3ThERESpfvrw8PDzUuXNnpaSkOIxx6NAhhYeHq1SpUvLx8dGQIUN04cIFh5q1a9eqUaNGcnV1VY0aNRQTE5OvlxkzZqhq1apyc3NTs2bNtHXr1puyzQAAoHi6rUOVJNWrV09Hjx41Hxs3bjTnDR48WN9++60WLlyodevW6ciRI+rUqZM5PycnR+Hh4crOztbmzZs1Z84cxcTEKCoqyqxJSkpSeHi4WrdurcTERA0aNEj9+vXTihUrzJr58+crMjJSI0eO1E8//aQGDRooLCxMx44duzU7AQAA3PZshmEYRd3E5YwaNUqLFy9WYmJivnnp6emqUKGC5s2bp6eeekqStHfvXtWpU0fx8fF68MEHtXz5cj322GM6cuSIfH19JUmzZs3SsGHDlJqaKhcXFw0bNkzLli3Trl27zLG7deumtLQ0xcbGSpKaNWumJk2aaPr06ZKk3NxcBQQEaODAgXr99dcLvT0ZGRny9PRUenq67Hb79e6Wqwoa8slNGxsorhImPFfULQAopgr7+X3bH6nav3+//P39de+996pHjx46dOiQJCkhIUHnz59XaGioWVu7dm1VrlxZ8fHxkqT4+HgFBgaagUqSwsLClJGRod27d5s1F4+RV5M3RnZ2thISEhxqnJycFBoaatYAAACUKOoGrqRZs2aKiYlRrVq1dPToUY0ePVotWrTQrl27lJycLBcXF3l5eTks4+vrq+TkZElScnKyQ6DKm58370o1GRkZOnv2rE6dOqWcnJwCa/bu3XvF/rOyspSVlWU+z8jIKPzGAwCAYuW2DlXt27c3/33//ferWbNmqlKlihYsWCB3d/ci7KxwoqOjNXr06KJuAwAA3AK3/em/i3l5ealmzZr67bff5Ofnp+zsbKWlpTnUpKSkyM/PT5Lk5+eX79uAec+vVmO32+Xu7i5vb285OzsXWJM3xuUMHz5c6enp5uPw4cPXvM0AAKB4KFah6syZMzpw4IAqVqyooKAglSxZUnFxceb8ffv26dChQwoODpYkBQcHa+fOnQ7f0lu1apXsdrvq1q1r1lw8Rl5N3hguLi4KCgpyqMnNzVVcXJxZczmurq6y2+0ODwAAcGe6rUPVa6+9pnXr1ungwYPavHmznnzySTk7O6t79+7y9PRU3759FRkZqTVr1ighIUG9e/dWcHCwHnzwQUlS27ZtVbduXT377LPavn27VqxYoREjRigiIkKurq6SpBdffFG///67hg4dqr1792rmzJlasGCBBg8ebPYRGRmpDz/
8UHPmzNGePXv00ksvKTMzU7179y6S/QIAAG4/t/U1VX/88Ye6d++uEydOqEKFCnr44Yf1ww8/qEKFCpKkyZMny8nJSZ07d1ZWVpbCwsI0c+ZMc3lnZ2ctXbpUL730koKDg1W6dGn17NlTY8aMMWuqVaumZcuWafDgwZo6daoqVaqkjz76SGFhYWZN165dlZqaqqioKCUnJ6thw4aKjY3Nd/E6AAC4e93W96m603CfKqDocJ8qANfrjrlPFQAAQHFAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALFCiqBsAABRe0JBPiroF4LaTMOG5om5BEkeqAAAALEGoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsAChCgAAwAKEqms0Y8YMVa1aVW5ubmrWrJm2bt1a1C0BAIDbAKHqGsyfP1+RkZEaOXKkfvrpJzVo0EBhYWE6duxYUbcGAACKGKHqGkyaNEnPP/+8evfurbp162rWrFkqVaqUPv7446JuDQAAFDFCVSFlZ2crISFBoaGh5jQnJyeFhoYqPj6+CDsDAAC3gxJF3UBxcfz4ceXk5MjX19dhuq+vr/bu3VvgMllZWcrKyjKfp6enS5IyMjJuXqOScrLO3tTxgeLoZv/e3Sr8fgP53ezf77zxDcO4Yh2h6iaKjo7W6NGj800PCAgogm6Au5vntBeLugUAN8mt+v0+ffq0PD09LzufUFVI3t7ecnZ2VkpKisP0lJQU+fn5FbjM8OHDFRkZaT7Pzc3VyZMnVb58edlstpvaL4peRkaGAgICdPjwYdnt9qJuB4CF+P2+uxiGodOnT8vf3/+KdYSqQnJxcVFQUJDi4uLUsWNHSX+HpLi4OA0YMKDAZVxdXeXq6uowzcvL6yZ3ituN3W7nTRe4Q/H7ffe40hGqPISqaxAZGamePXuqcePGatq0qaZMmaLMzEz17t27qFsDAABFjFB1Dbp27arU1FRFRUUpOTlZDRs2VGxsbL6L1wEAwN2HUHWNBgwYcNnTfcDFXF1dNXLkyHyngAEUf/x+oyA242rfDwQAAMBVcfNPAAAACxCqAAAALECoAgAAsAChCgAAwAKEKuAG9OrVSzabTW+99ZbD9MWLF3PXfKAYMgxDoaGhCgsLyzdv5syZ8vLy0h9//FEEnaE4IFQBN8jNzU1vv/22Tp06VdStALhBNptNs2fP1pYtW/T++++b05OSkjR06FBNmzZNlSpVKsIOcTsjVAE3KDQ0VH5+foqOjr5szVdffaV69erJ1dVVVatW1cSJE29hhwCuRUBAgKZOnarXXntNSUlJMgxDffv2Vdu2bfXAAw+offv28vDwkK+vr5599lkdP37cXPbLL79UYGCg3N3dVb58eYWGhiozM7MItwa3EqEKuEHOzs76z3/+o2nTphV4WiAhIUFdunRRt27dtHPnTo0aNUpvvvmmYmJibn2zAAqlZ8+eeuSRR9SnTx9Nnz5du3bt0vvvv682bdrogQce0LZt2xQbG6uUlBR16dJFknT06FF1795dffr00Z49e7R27Vp16tRJ3A7y7sHNP4Eb0KtXL6WlpWnx4sUKDg5W3bp19X//939avHixnnzySRmGoR49eig1NVUrV640lxs6dKiWLVum3bt3F2H3AK7k2LFjqlevnk6ePKmvvvpKu3bt0oYNG7RixQqz5o8//lBAQID27dunM2f
OKCgoSAcPHlSVKlWKsHMUFY5UARZ5++23NWfOHO3Zs8dh+p49e9S8eXOHac2bN9f+/fuVk5NzK1sEcA18fHz0wgsvqE6dOurYsaO2b9+uNWvWyMPDw3zUrl1bknTgwAE1aNBAjzzyiAIDA/X000/rww8/5FrLuwyhCrBISEiIwsLCNHz48KJuBYBFSpQooRIl/v4zuWfOnNHjjz+uxMREh8f+/fsVEhIiZ2dnrVq1SsuXL1fdunU1bdo01apVS0lJSUW8FbhV+IPKgIXeeustNWzYULVq1TKn1alTR5s2bXKo27Rpk2rWrClnZ+db3SKA69SoUSN99dVXqlq1qhm0LmWz2dS8eXM1b95cUVFRqlKlihYtWqTIyMhb3C2KAkeqAAsFBgaqR48eevfdd81pr776quLi4jR27Fj9+uuvmjNnjqZPn67XXnutCDsFcK0iIiJ08uRJde/eXT/++KMOHDigFStWqHfv3srJydGWLVv0n//8R9u2bdOhQ4f09ddfKzU1VXXq1Cnq1nGLEKoAi40ZM0a5ubnm80aNGmnBggX64osvVL9+fUVFRWnMmDHq1atX0TUJ4Jr5+/tr06ZNysnJUdu2bRUYGKhBgwbJy8tLTk5OstvtWr9+vTp06KCaNWtqxIgRmjhxotq3b1/UreMW4dt/AAAAFuBIFQAAgAUIVQAAABYgVAEAAFiAUAUAAGABQhUAAIAFCFUAAAAWIFQBAABYgFAFADfB2rVrZbPZlJaWVtStALhFCFUAbku9evVSx44d802/VWFl1KhRatiwYb7pVatWlc1mk81mk7u7u6pWraouXbpo9erVDnUPPfSQjh49Kk9Pz5vaJ4DbB6EKAC5iGIYuXLhwxZoxY8bo6NGj2rdvnz755BN5eXkpNDRU48ePN2tcXFzk5+cnm812s1sGcJsgVAEo1jZu3KgWLVrI3d1dAQEBevnll5WZmWnO//TTT9W4cWOVKVNGfn5+euaZZ3Ts2DFzft6Rr+XLlysoKEiurq767LPPNHr0aG3fvt08KhUTE2MukzdW5cqVFRISog8++EBvvvmmoqKitG/fPodx846o/e9//9Pjjz+usmXLqnTp0qpXr56+++47c8xdu3apffv28vDwkK+vr5599lkdP37cnB8bG6uHH35YXl5eKl++vB577DEdOHDAnJ+dna0BAwaoYsWKcnNzU5UqVRQdHW3OT0tLU79+/VShQgXZ7Xa1adNG27dvt+znAIBQBaAYO3DggNq1a6fOnTtrx44dmj9/vjZu3KgBAwaYNefPn9fYsWO1fft2LV68WAcPHizwj1m//vrreuutt7Rnzx49+uijevXVV1WvXj0dPXpUR48eVdeuXa/YyyuvvCLDMLRkyZIC50dERCgrK0vr16/Xzp079fbbb8vDw0PS34GnTZs2euCBB7Rt2zbFxsYqJSVFXbp0MZfPzMxUZGSktm3bpri4ODk5OenJJ580/3j3u+++q2+++UYLFizQvn37NHfuXFWtWtVc/umnn9axY8e0fPlyJSQkqFGjRnrkkUd08uTJwu5uAFdjAMBtqGfPnoazs7NRunRph4ebm5shyTh16pTRt29fo3///g7LbdiwwXBycjLOnj1b4Lg//vijIck4ffq0YRiGsWbNGkOSsXjxYoe6kSNHGg0aNMi3fJUqVYzJkycXOLavr6/x0ksvOYx76tQpwzAMIzAw0Bg1alSBy40dO9Zo27atw7TDhw8bkox9+/YVuExqaqohydi5c6dhGIYxcOBAo02bNkZubm6+2g0bNhh2u904d+6cw/Tq1asb77//foHjA7h2HKkCcNtq3bq1EhMTHR4fffSROX/79u2KiYmRh4eH+QgLC1Nubq6SkpIkSQkJCXr88cdVuXJllSlTRi1btpQkHTp0yGFdjRs3vuF+DcO47DVUL7/8ssaNG6fmzZtr5MiR2rFjh8N2rFmzxmE7ateuLUnmKb79+/ere/fuuvfee2W3282jUHnb0at
XLyUmJqpWrVp6+eWXtXLlSofxz5w5o/LlyzusIykpyeEUIoAbU6KoGwCAyyldurRq1KjhMO2PP/4w/33mzBm98MILevnll/MtW7lyZWVmZiosLExhYWGaO3euKlSooEOHDiksLEzZ2dn51nUjTpw4odTUVFWrVq3A+f369VNYWJiWLVumlStXKjo6WhMnTtTAgQN15swZPf7443r77bfzLVexYkVJ0uOPP64qVaroww8/lL+/v3Jzc1W/fn1zOxo1aqSkpCQtX75c33//vbp06aLQ0FB9+eWXOnPmjCpWrKi1a9fmG9/Ly+uGthvA/0eoAlBsNWrUSL/88ku+4JVn586dOnHihN566y0FBARIkrZt21aosV1cXJSTk1PoXqZOnSonJ6cCbwORJyAgQC+++KJefPFFDR8+XB9++KEGDhyoRo0a6auvvlLVqlVVokT+t+UTJ05o3759+vDDD9WiRQtJf1+gfym73a6uXbuqa9eueuqpp9SuXTudPHlSjRo1UnJyskqUKOFwnRUAa3H6D0CxNWzYMG3evFkDBgxQYmKi9u/fryVLlpgXqleuXFkuLi6aNm2afv/9d33zzTcaO3ZsocauWrWqkpKSlJiYqOPHjysrK8ucd/r0aSUnJ+vw4cNav369+vfvr3Hjxmn8+PGXDXiDBg3SihUrlJSUpJ9++klr1qxRnTp1JP19EfvJkyfVvXt3/fjjjzpw4IBWrFih3r17KycnR2XLllX58uX1wQcf6LffftPq1asVGRnpMP6kSZP0+eefa+/evfr111+1cOFC+fn5mbd7CA4OVseOHbVy5UodPHhQmzdv1htvvFHokAng6ghVAIqt+++/X+vWrdOvv/6qFi1a6IEHHlBUVJT8/f0lSRUqVFBMTIwWLlyounXr6q233tJ///vfQo3duXNntWvXTq1bt1aFChX0+eefm/OioqJUsWJF1ahRQ88++6zS09MVFxenYcOGXXa8nJwcRUREqE6dOmrXrp1q1qypmTNnSpL8/f21adMm5eTkqG3btgoMDNSgQYPk5eUlJycnOTk56YsvvlBCQoLq16+vwYMHa8KECQ7jlylTRu+8844aN26sJk2a6ODBg/ruu+/k5OQkm82m7777TiEhIerdu7dq1qypbt266X//+598fX2vdbcDuAybYRhGUTcBAABQ3HGkCgAAwAKEKgAAAAsQqgAAACxAqAIAALAAoQoAAMAChCoAAAALEKoAAAAsQKgCAACwAKEKAADAAoQqAAAACxCqAAAALECoAgAAsMD/A8EDBhsbeGvMAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Распределение классов в Обучающей выборке:\n", "HeartDisease\n", "No 143331\n", "Yes 13368\n", "Name: count, dtype: int64\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Распределение классов в Контрольной выборке:\n", "HeartDisease\n", "No 61442\n", "Yes 5715\n", "Name: count, dtype: int64\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Распределение классов в Тестовой выборке:\n", "HeartDisease\n", "No 87649\n", "Yes 8290\n", "Name: count, dtype: int64\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "import pandas as pd\n",
 "import matplotlib.pyplot as plt\n",
 "import seaborn as sns\n",
 "\n",
 "# Class counts of the target over the whole dataset.\n",
 "class_distribution = df['HeartDisease'].value_counts()\n",
 "print(\"Распределение классов в HeartDisease:\")\n",
 "print(class_distribution)\n",
 "\n",
 "# The same distribution as a bar chart, drawn on an explicit Axes.\n",
 "overall_fig, overall_ax = plt.subplots()\n",
 "sns.countplot(x='HeartDisease', data=df, ax=overall_ax)\n",
 "overall_ax.set_title('Распределение классов в HeartDisease')\n",
 "plt.show()\n",
 "\n",
 "def check_balance(df, title):\n",
 "    \"\"\"Print the HeartDisease class counts of `df` and show them as a count plot.\n",
 "\n",
 "    `title` is inserted into both the printed caption and the plot title.\n",
 "    \"\"\"\n",
 "    counts = df['HeartDisease'].value_counts()\n",
 "    print(f\"Распределение классов в {title}:\")\n",
 "    print(counts)\n",
 "    fig, ax = plt.subplots()\n",
 "    sns.countplot(x='HeartDisease', data=df, ax=ax)\n",
 "    ax.set_title(f'Распределение классов в {title}')\n",
 "    plt.show()\n",
 "\n",
 "# Run the same check on the training, validation and test splits, in that order.\n",
 "for subset, caption in (\n",
 "    (train_df, 'Обучающей выборке'),\n",
 "    (val_df, 'Контрольной выборке'),\n",
 "    (test_df, 'Тестовой выборке'),\n",
 "):\n",
 "    check_balance(subset, caption)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Можно заметить, что данные не сбалансированы - во всех выборках количество значений \"No\" превышает \"Yes\" в среднем в 10 раз. 
Для балансировки данных будет применен метод upsampling " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Размер обучающей выборки до upsampling: 156699\n", "Размер контрольной выборки: 67157\n", "Размер тестовой выборки: 95939\n", "\n", "Распределение классов в всем датасете:\n", "Класс No: 292422 (91.44%)\n", "Класс Yes: 27373 (8.56%)\n", "\n", "Распределение классов в Обучающей выборке до upsampling:\n", "Класс No: 143331 (91.47%)\n", "Класс Yes: 13368 (8.53%)\n", "Размер обучающей выборки после upsampling: 286662\n", "\n", "Распределение классов в Обучающей выборке после upsampling:\n", "Класс No: 143331 (50.00%)\n", "Класс Yes: 143331 (50.00%)\n", "\n", "Распределение классов в Контрольной выборке:\n", "Класс No: 61442 (91.49%)\n", "Класс Yes: 5715 (8.51%)\n", "\n", "Распределение классов в Тестовой выборке:\n", "Класс No: 87649 (91.36%)\n", "Класс Yes: 8290 (8.64%)\n" ] } ], "source": [
 "import pandas as pd\n",
 "from sklearn.model_selection import train_test_split\n",
 "from imblearn.over_sampling import RandomOverSampler\n",
 "\n",
 "# Hold out 30% of all rows as the test set.\n",
 "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
 "\n",
 "# Split the remaining 70% again 70/30 into training and validation parts\n",
 "# (roughly 49% / 21% / 30% of the full dataset).\n",
 "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
 "\n",
 "# Split sizes before any resampling.\n",
 "print(\"Размер обучающей выборки до upsampling:\", len(train_df))\n",
 "print(\"Размер контрольной выборки:\", len(val_df))\n",
 "print(\"Размер тестовой выборки:\", len(test_df))\n",
 "\n",
 "def check_balance(df, title):\n",
 "    \"\"\"Print the count and percentage share of every HeartDisease class in `df`.\n",
 "\n",
 "    `title` is inserted into the caption line printed before the per-class rows.\n",
 "    \"\"\"\n",
 "    class_distribution = df['HeartDisease'].value_counts()\n",
 "    print(f\"\\nРаспределение классов в {title}:\")\n",
 "    for cls, count in class_distribution.items():\n",
 "        # Share of this class among all rows of the frame, in percent.\n",
 "        print(f\"Класс {cls}: {count} ({count / len(df) * 100:.2f}%)\")\n",
 "\n",
 "# Balance of the whole dataset and of the training split before upsampling.\n",
 "check_balance(df, 'всем датасете')\n",
 "check_balance(train_df, 'Обучающей выборке до upsampling')\n",
 "\n",
 "# Only the training split is oversampled; validation and test are left as they are.\n",
 "X_train = train_df.drop('HeartDisease', axis=1)  # features\n",
 "y_train = train_df['HeartDisease']  # target\n",
 "\n",
 "ros = RandomOverSampler(random_state=42)\n",
 "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
 "\n",
 "# Re-attach the target to the features to get a single resampled frame\n",
 "# (the target column ends up last).\n",
 "train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n",
 "\n",
 "print(\"Размер обучающей выборки после upsampling:\", len(train_df_resampled))\n",
 "check_balance(train_df_resampled, 'Обучающей выборке после upsampling')\n",
 "\n",
 "# Validation and test splits are reported for comparison; they are not resampled above.\n",
 "check_balance(val_df, 'Контрольной выборке')\n",
 "check_balance(test_df, 'Тестовой выборке')"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Данные были сбалансированы. Теперь можно перейти к конструированию признаков. Поставлены следующие задачи:\n", "\n", "1. Разработка персонализированных программ профилактики сердечно-сосудистых заболеваний\n", "2. 
Улучшение качества медицинской помощи" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Унитарное кодирование категориальных признаков (one-hot encoding)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from imblearn.over_sampling import RandomOverSampler\n", "\n", "# Определение категориальных признаков\n", "categorical_features = [\n", " 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n", " 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n", "]\n", "\n", "# Применение one-hot encoding к обучающей выборке\n", "train_df_resampled_encoded = pd.get_dummies(train_df_resampled, columns=categorical_features)\n", "\n", "# Применение one-hot encoding к контрольной выборке\n", "val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n", "\n", "# Применение one-hot encoding к тестовой выборке\n", "test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Дискретизация числовых признаков" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from imblearn.over_sampling import RandomOverSampler\n", "\n", "# Загрузка данных\n", "df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n", "\n", "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", "\n", "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", "\n", "# Применение upsampling к обучающей выборке\n", "X_train = train_df.drop('HeartDisease', 
axis=1) # Отделяем признаки от целевой переменной\n", "y_train = train_df['HeartDisease'] # Целевая переменная\n", "\n", "# Инициализация RandomOverSampler\n", "ros = RandomOverSampler(random_state=42)\n", "\n", "# Применение upsampling\n", "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n", "\n", "# Создание нового DataFrame с балансированными данными\n", "train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n", "\n", "# Определение числовых признаков для дискретизации\n", "numerical_features = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']\n", "\n", "# Функция для дискретизации числовых признаков\n", "def discretize_features(df, features, bins=5, labels=False):\n", " for feature in features:\n", " df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n", " return df\n", "\n", "# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n", "train_df_resampled = discretize_features(train_df_resampled, numerical_features)\n", "val_df = discretize_features(val_df, numerical_features)\n", "test_df = discretize_features(test_df, numerical_features)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ручной синтез. \n", "Создание новых признаков на основе экспертных знаний и логики предметной области. Например, для данных о продаже автомобилей можно создать признак \"возраст автомобиля\" как разницу между текущим годом и годом выпуска. В данном наборе данных таким признаком является числовой \"Age\": каждое значение категориального признака \"AgeCategory\" заменяется приблизительной серединой возрастного интервала (для \"80 or older\" — значением 80)." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Загрузка данных\n", "df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n", "\n", "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", "\n", "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", "\n", "# Применение upsampling к обучающей выборке\n", "X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n", "y_train = train_df['HeartDisease'] # Целевая переменная\n", "\n", "# Инициализация RandomOverSampler\n", "ros = RandomOverSampler(random_state=42)\n", "\n", "# Применение upsampling\n", "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n", "\n", "# Создание нового DataFrame с балансированными данными\n", "train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n", "\n", "# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n", "age_mapping = {\n", " '18-24': 21,\n", " '25-29': 27,\n", " '30-34': 32,\n", " '35-39': 37,\n", " '40-44': 42,\n", " '45-49': 47,\n", " '50-54': 52,\n", " '55-59': 57,\n", " '60-64': 62,\n", " '65-69': 67,\n", " '70-74': 72,\n", " '75-79': 77,\n", " '80 or older': 80\n", "}\n", "\n", "train_df_resampled['Age'] = train_df_resampled['AgeCategory'].map(age_mapping)\n", "val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n", "test_df['Age'] = test_df['AgeCategory'].map(age_mapping)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. 
Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# Загрузка данных\n", "df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n", "\n", "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", "\n", "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", "\n", "# Применение upsampling к обучающей выборке\n", "X_train = train_df.drop('HeartDisease', axis=1) # Отделяем признаки от целевой переменной\n", "y_train = train_df['HeartDisease'] # Целевая переменная\n", "\n", "# Инициализация RandomOverSampler\n", "ros = RandomOverSampler(random_state=42)\n", "\n", "# Применение upsampling\n", "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n", "\n", "# Создание нового DataFrame с балансированными данными\n", "train_df_resampled = pd.concat([X_train_resampled, y_train_resampled], axis=1)\n", "\n", "# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n", "age_mapping = {\n", " '18-24': 21,\n", " '25-29': 27,\n", " '30-34': 32,\n", " '35-39': 37,\n", " '40-44': 42,\n", " '45-49': 47,\n", " '50-54': 52,\n", " '55-59': 57,\n", " '60-64': 62,\n", " '65-69': 67,\n", " '70-74': 72,\n", " '75-79': 77,\n", " '80 or older': 80\n", "}\n", "\n", "train_df_resampled['Age'] = train_df_resampled['AgeCategory'].map(age_mapping)\n", "val_df['Age'] = 
val_df['AgeCategory'].map(age_mapping)\n", "test_df['Age'] = test_df['AgeCategory'].map(age_mapping)\n", "\n", "# Определение числовых признаков для масштабирования\n", "numerical_features_to_scale = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Age']\n", "\n", "# Инициализация StandardScaler\n", "scaler = StandardScaler()\n", "\n", "# Масштабирование числовых признаков в обучающей выборке\n", "train_df_resampled[numerical_features_to_scale] = scaler.fit_transform(train_df_resampled[numerical_features_to_scale])\n", "\n", "# Масштабирование числовых признаков в контрольной и тестовой выборках\n", "val_df[numerical_features_to_scale] = scaler.transform(val_df[numerical_features_to_scale])\n", "test_df[numerical_features_to_scale] = scaler.transform(test_df[numerical_features_to_scale])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Конструирование признаков с применением фреймворка Featuretools" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n", " warnings.warn(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", " warnings.warn(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. 
To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. 
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Обучающая выборка после конструирования признаков:\n", " HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n", "id \n", "0 False 32.23 False False False 0.0 \n", "1 False 29.53 False False False 0.0 \n", "2 False 30.13 False False False 0.0 \n", "3 False 35.43 False False False 0.0 \n", "4 False 29.53 False False False 0.0 \n", "\n", " MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n", "id \n", "0 0.0 True Male 75-79 White Yes \n", "1 0.0 True Female 50-54 White No \n", "2 0.0 False Male 50-54 White No \n", "3 15.0 False Female 18-24 Hispanic No \n", "4 0.0 True Female 65-69 White Yes \n", "\n", " PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \\\n", "id \n", "0 True Fair 6.0 False True True \n", "1 True Very good 8.0 False False False \n", "2 True Excellent 7.0 False False False \n", "3 True Good 7.0 False False False \n", "4 False Good 10.0 False False False \n", "\n", " Age \n", "id \n", "0 77 \n", "1 52 \n", "2 52 \n", "3 21 \n", "4 67 \n", "Контрольная выборка после конструирования признаков:\n", " HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n", "id \n", "80125 False 22.71 False False False 0.0 \n", "116296 False 25.80 False False False 0.0 \n", "18780 False 17.74 True False False 0.0 \n", "233006 NaN NaN \n", "182306 NaN NaN \n", "\n", " MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n", "id \n", "80125 0.0 False Male 25-29 White No \n", "116296 0.0 False Male 50-54 Hispanic No \n", "18780 0.0 False Female 65-69 White No \n", "233006 NaN NaN NaN NaN NaN \n", "182306 NaN NaN NaN NaN NaN \n", "\n", " PhysicalActivity GenHealth SleepTime Asthma KidneyDisease \\\n", "id \n", "80125 True Excellent 8.0 False False \n", "116296 True Good 7.0 False False \n", 
"18780 True Good 7.0 False False \n", "233006 NaN NaN \n", "182306 NaN NaN \n", "\n", " SkinCancer Age \n", "id \n", "80125 False 27 \n", "116296 False 52 \n", "18780 False 67 \n", "233006 \n", "182306 \n", "Тестовая выборка после конструирования признаков:\n", " HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n", "id \n", "271884 NaN NaN \n", "270361 NaN NaN \n", "219060 NaN NaN \n", "24010 False 26.5 False False False 14.0 \n", "181930 NaN NaN \n", "\n", " MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n", "id \n", "271884 NaN NaN NaN NaN NaN \n", "270361 NaN NaN NaN NaN NaN \n", "219060 NaN NaN NaN NaN NaN \n", "24010 0.0 True Male 75-79 White Yes \n", "181930 NaN NaN NaN NaN NaN \n", "\n", " PhysicalActivity GenHealth SleepTime Asthma KidneyDisease \\\n", "id \n", "271884 NaN NaN \n", "270361 NaN NaN \n", "219060 NaN NaN \n", "24010 True Excellent 9.0 False False \n", "181930 NaN NaN \n", "\n", " SkinCancer Age \n", "id \n", "271884 \n", "270361 \n", "219060 \n", "24010 False 77 \n", "181930 \n" ] } ], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "import featuretools as ft\n", "\n", "# Загрузка данных\n", "df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n", "\n", "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", "\n", "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", "\n", "# Создание нового признака \"Age\" на основе категориального признака \"AgeCategory\"\n", "age_mapping = {\n", " '18-24': 21,\n", " '25-29': 27,\n", " '30-34': 32,\n", " '35-39': 37,\n", " '40-44': 42,\n", " '45-49': 47,\n", " '50-54': 52,\n", " '55-59': 57,\n", " '60-64': 62,\n", " '65-69': 67,\n", " '70-74': 72,\n", " '75-79': 77,\n", " '80 or 
older': 80\n", "}\n", "\n", "train_df['Age'] = train_df['AgeCategory'].map(age_mapping)\n", "val_df['Age'] = val_df['AgeCategory'].map(age_mapping)\n", "test_df['Age'] = test_df['AgeCategory'].map(age_mapping)\n", "\n", "# Определение сущностей\n", "es = ft.EntitySet(id='heart_data')\n", "es = es.add_dataframe(dataframe_name='train', dataframe=train_df, index='id')\n", "\n", "# Генерация признаков\n", "feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='train', max_depth=2)\n", "\n", "# Преобразование признаков для контрольной и тестовой выборок\n", "val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n", "test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n", "\n", "# Вывод первых нескольких строк для проверки\n", "print(\"Обучающая выборка после конструирования признаков:\")\n", "print(feature_matrix.head())\n", "print(\"Контрольная выборка после конструирования признаков:\")\n", "print(val_feature_matrix.head())\n", "print(\"Тестовая выборка после конструирования признаков:\")\n", "print(test_feature_matrix.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Оценка качества каждого набора признаков\n", "\n", "Предсказательная способность\n", "Метрики: RMSE, MAE, R²\n", "\n", "Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n", "\n", "Скорость вычисления\n", "Методы: Измерение времени выполнения генерации признаков и обучения модели.\n", "\n", "Надежность\n", "Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n", "\n", "Корреляция\n", "Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n", "\n", "Цельность\n", "Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели." 
] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Размер обучающей выборки: 156699\n", "Размер контрольной выборки: 67157\n", "Размер тестовой выборки: 95939\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n", " warnings.warn(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", " warnings.warn(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. 
To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Feature Importance:\n", " feature importance\n", "0 HeartDisease 0.851120\n", "1 BMI 0.014203\n", "9 Stroke_No 0.012600\n", "2 PhysicalHealth 0.008628\n", "12 DiffWalking_Yes 0.008111\n", "11 DiffWalking_No 0.007721\n", "36 Diabetic_Yes 0.007583\n", "10 Stroke_Yes 0.007551\n", "4 SleepTime 0.007525\n", "43 GenHealth_Poor 0.006605\n", "27 AgeCategory_80 or older 0.006269\n", "34 Diabetic_No 0.005300\n", "3 MentalHealth 0.005102\n", "41 GenHealth_Fair 0.004277\n", "48 KidneyDisease_Yes 0.003435\n", "47 KidneyDisease_No 0.003086\n", "13 Sex_Female 0.002607\n", "26 AgeCategory_75-79 0.002567\n", "25 AgeCategory_70-74 0.002462\n", "14 Sex_Male 0.002457\n", "6 Smoking_Yes 0.002127\n", "5 Smoking_No 0.001934\n", "42 GenHealth_Good 0.001787\n", "44 GenHealth_Very good 0.001734\n", "33 Race_White 0.001731\n", "50 SkinCancer_Yes 0.001687\n", "38 PhysicalActivity_No 0.001658\n", "39 PhysicalActivity_Yes 0.001585\n", "49 SkinCancer_No 0.001513\n", "40 GenHealth_Excellent 0.001451\n", "24 AgeCategory_65-69 0.001318\n", "46 Asthma_Yes 0.001315\n", "45 Asthma_No 0.001256\n", "23 AgeCategory_60-64 0.001091\n", "30 Race_Black 0.000885\n", "22 AgeCategory_55-59 0.000853\n", "31 Race_Hispanic 0.000825\n", "21 AgeCategory_50-54 0.000715\n", "32 Race_Other 0.000699\n", "7 
AlcoholDrinking_No 0.000560\n", "8 AlcoholDrinking_Yes 0.000550\n", "20 AgeCategory_45-49 0.000520\n", "28 Race_American Indian/Alaskan Native 0.000503\n", "35 Diabetic_No, borderline diabetes 0.000479\n", "19 AgeCategory_40-44 0.000444\n", "18 AgeCategory_35-39 0.000412\n", "17 AgeCategory_30-34 0.000267\n", "29 Race_Asian 0.000260\n", "15 AgeCategory_18-24 0.000231\n", "16 AgeCategory_25-29 0.000217\n", "37 Diabetic_Yes (during pregnancy) 0.000184\n" ] } ], "source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"import featuretools as ft\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# Load the data\n",
"df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n",
"\n",
"# Split into train and test sets (70% train, 30% test)\n",
"train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n",
"\n",
"# Split the train set again into train and validation sets (70% / 30%)\n",
"train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n",
"\n",
"# Report the size of every split\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# Target column and the categorical predictors to one-hot encode\n",
"target_column = 'HeartDisease'\n",
"categorical_features = [\n",
"    'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
"    'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
"]\n",
"\n",
"# One-hot encode every split\n",
"train_df_encoded = pd.get_dummies(train_df, columns=categorical_features)\n",
"val_df_encoded = pd.get_dummies(val_df, columns=categorical_features)\n",
"test_df_encoded = pd.get_dummies(test_df, columns=categorical_features)\n",
"\n",
"# Each split is encoded independently, so a category that is absent from one\n",
"# split would produce a different column set. Align validation/test to the\n",
"# train schema; a dummy column that is missing means \"never observed\" -> False.\n",
"val_df_encoded = val_df_encoded.reindex(columns=train_df_encoded.columns, fill_value=False)\n",
"test_df_encoded = test_df_encoded.reindex(columns=train_df_encoded.columns, fill_value=False)\n",
"\n",
"\n",
"def build_entityset(encoded_df, entityset_id):\n",
"    \"\"\"Wrap the predictors of one encoded split in a single-dataframe EntitySet.\n",
"\n",
"    The target column is dropped so that it cannot leak into the generated\n",
"    features, and the original row labels are stored in an explicit 'id'\n",
"    column so the resulting feature matrix can be matched back to the target\n",
"    by label instead of by position.\n",
"    \"\"\"\n",
"    predictors = encoded_df.drop(columns=[target_column]).rename_axis('id').reset_index()\n",
"    entityset = ft.EntitySet(id=entityset_id)\n",
"    return entityset.add_dataframe(dataframe_name='heart', dataframe=predictors, index='id')\n",
"\n",
"\n",
"# Define the entities (train split only)\n",
"es = build_entityset(train_df_encoded, 'heart_data')\n",
"\n",
"# Generate the features\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='heart', max_depth=2)\n",
"\n",
"# Apply the same feature definitions to validation and test. Each split needs its\n",
"# own entityset: the train entityset does not contain the validation/test rows, so\n",
"# looking them up there by instance_ids cannot return their features.\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=build_entityset(val_df_encoded, 'heart_data_val'))\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=build_entityset(test_df_encoded, 'heart_data_test'))\n",
"\n",
"# Feature importance. X no longer contains HeartDisease: with the target left in,\n",
"# it dominated the importances (about 0.85) purely through leakage.\n",
"X = feature_matrix\n",
"y = train_df_encoded.loc[feature_matrix.index, target_column]\n",
"\n",
"# Hold out part of the train features for evaluation\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Fit the model\n",
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Collect the importances\n",
"importances = model.feature_importances_\n",
"feature_names = feature_matrix.columns\n",
"\n",
"# Sort the features by importance, most important first\n",
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
"\n",
"print(\"Feature Importance:\")\n",
"print(feature_importance)"
] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Размер обучающей выборки: 156699\n", "Размер контрольной выборки: 67157\n", "Размер тестовой выборки: 95939\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in 
dataframe, creating new integer column\n", " warnings.warn(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", " warnings.warn(\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n", "c:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. 
To retain the old behavior, exclude the relevant entries before the concat operation.\n", " df = pd.concat([df, default_df], sort=True)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.9977068603095739\n", "Precision: 0.9982521847690387\n", "Recall: 0.9753598438643571\n", "F1 Score: 0.9866732477788747\n", "ROC AUC: 0.9875985227973351\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[31], line 84\u001b[0m\n\u001b[0;32m 81\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mROC AUC: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mroc_auc\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 83\u001b[0m \u001b[38;5;66;03m# Кросс-валидация\u001b[39;00m\n\u001b[1;32m---> 84\u001b[0m scores \u001b[38;5;241m=\u001b[39m \u001b[43mcross_val_score\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscoring\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43maccuracy\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 85\u001b[0m accuracy_cv \u001b[38;5;241m=\u001b[39m scores\u001b[38;5;241m.\u001b[39mmean()\n\u001b[0;32m 86\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCross-validated Accuracy: 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maccuracy_cv\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must 
be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:712\u001b[0m, in \u001b[0;36mcross_val_score\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, error_score)\u001b[0m\n\u001b[0;32m 709\u001b[0m \u001b[38;5;66;03m# To ensure multimetric format is not supported\u001b[39;00m\n\u001b[0;32m 710\u001b[0m scorer \u001b[38;5;241m=\u001b[39m check_scoring(estimator, scoring\u001b[38;5;241m=\u001b[39mscoring)\n\u001b[1;32m--> 712\u001b[0m cv_results \u001b[38;5;241m=\u001b[39m \u001b[43mcross_validate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 713\u001b[0m \u001b[43m \u001b[49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 714\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 715\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 716\u001b[0m \u001b[43m \u001b[49m\u001b[43mgroups\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroups\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 717\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mscoring\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mscore\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mscorer\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 718\u001b[0m \u001b[43m \u001b[49m\u001b[43mcv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 719\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 720\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 721\u001b[0m \u001b[43m \u001b[49m\u001b[43mfit_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 722\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 723\u001b[0m \u001b[43m \u001b[49m\u001b[43mpre_dispatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpre_dispatch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 724\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_score\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merror_score\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 725\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 726\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m cv_results[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 
208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n", "File 
\u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:423\u001b[0m, in \u001b[0;36mcross_validate\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)\u001b[0m\n\u001b[0;32m 420\u001b[0m \u001b[38;5;66;03m# We clone the estimator to make sure that all the folds are\u001b[39;00m\n\u001b[0;32m 421\u001b[0m \u001b[38;5;66;03m# independent, and that it is pickle-able.\u001b[39;00m\n\u001b[0;32m 422\u001b[0m parallel \u001b[38;5;241m=\u001b[39m Parallel(n_jobs\u001b[38;5;241m=\u001b[39mn_jobs, verbose\u001b[38;5;241m=\u001b[39mverbose, pre_dispatch\u001b[38;5;241m=\u001b[39mpre_dispatch)\n\u001b[1;32m--> 423\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mparallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 424\u001b[0m \u001b[43m \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_fit_and_score\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 425\u001b[0m \u001b[43m \u001b[49m\u001b[43mclone\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 426\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 427\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 428\u001b[0m \u001b[43m \u001b[49m\u001b[43mscorer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mscorers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 429\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 430\u001b[0m \u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtest\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 431\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mparameters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mfit_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mscore_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrouted_params\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscorer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscore\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 435\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_train_score\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_train_score\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 436\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_times\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 437\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_estimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_estimator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 438\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_score\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merror_score\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 439\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 440\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrain\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mindices\u001b[49m\n\u001b[0;32m 441\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 443\u001b[0m _warn_or_raise_about_fit_failures(results, error_score)\n\u001b[0;32m 445\u001b[0m \u001b[38;5;66;03m# For callable scoring, the return type is only know after calling. If the\u001b[39;00m\n\u001b[0;32m 446\u001b[0m \u001b[38;5;66;03m# return type is a dictionary, the error scores can now be inserted with\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[38;5;66;03m# the correct key.\u001b[39;00m\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1918\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1916\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m 1917\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1920\u001b[0m \u001b[38;5;66;03m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m 1921\u001b[0m \u001b[38;5;66;03m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m 1922\u001b[0m \u001b[38;5;66;03m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m 1923\u001b[0m \u001b[38;5;66;03m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m 1924\u001b[0m \u001b[38;5;66;03m# callback.\u001b[39;00m\n\u001b[0;32m 1925\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1847\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1845\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_batches \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1847\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1848\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_completed_tasks 
\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1849\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_progress()\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:136\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 134\u001b[0m config \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mconfig):\n\u001b[1;32m--> 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:888\u001b[0m, in \u001b[0;36m_fit_and_score\u001b[1;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, score_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)\u001b[0m\n\u001b[0;32m 886\u001b[0m estimator\u001b[38;5;241m.\u001b[39mfit(X_train, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\n\u001b[0;32m 887\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 888\u001b[0m \u001b[43mestimator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 890\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[0;32m 891\u001b[0m \u001b[38;5;66;03m# Note fit time as time until error\u001b[39;00m\n\u001b[0;32m 892\u001b[0m fit_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start_time\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:489\u001b[0m, in \u001b[0;36mBaseForest.fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 478\u001b[0m trees \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m 479\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_estimator(append\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[0;32m 480\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(n_more_estimators)\n\u001b[0;32m 481\u001b[0m ]\n\u001b[0;32m 483\u001b[0m \u001b[38;5;66;03m# Parallel loop: we prefer the threading backend as the Cython code\u001b[39;00m\n\u001b[0;32m 484\u001b[0m \u001b[38;5;66;03m# for fitting the trees is internally releasing the Python GIL\u001b[39;00m\n\u001b[0;32m 485\u001b[0m \u001b[38;5;66;03m# making threading more efficient than multiprocessing in\u001b[39;00m\n\u001b[0;32m 486\u001b[0m \u001b[38;5;66;03m# that case. However, for joblib 0.12+ we respect any\u001b[39;00m\n\u001b[0;32m 487\u001b[0m \u001b[38;5;66;03m# parallel_backend contexts set at a higher level,\u001b[39;00m\n\u001b[0;32m 488\u001b[0m \u001b[38;5;66;03m# since correctness does not rely on using threads.\u001b[39;00m\n\u001b[1;32m--> 489\u001b[0m trees \u001b[38;5;241m=\u001b[39m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 490\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 491\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 492\u001b[0m \u001b[43m \u001b[49m\u001b[43mprefer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mthreads\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 493\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 494\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_parallel_build_trees\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 495\u001b[0m \u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbootstrap\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 501\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtrees\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 502\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 503\u001b[0m \u001b[43m \u001b[49m\u001b[43mclass_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclass_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 504\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_samples_bootstrap\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_samples_bootstrap\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 505\u001b[0m \u001b[43m \u001b[49m\u001b[43mmissing_values_in_feature_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmissing_values_in_feature_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 506\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 507\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m 
\u001b[49m\u001b[43mi\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtrees\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 508\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 510\u001b[0m \u001b[38;5;66;03m# Collect newly grown trees\u001b[39;00m\n\u001b[0;32m 511\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mestimators_\u001b[38;5;241m.\u001b[39mextend(trees)\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43miterable_with_config\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1918\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1916\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m 1917\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1920\u001b[0m \u001b[38;5;66;03m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m 1921\u001b[0m \u001b[38;5;66;03m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m 1922\u001b[0m \u001b[38;5;66;03m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m 1923\u001b[0m \u001b[38;5;66;03m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m 1924\u001b[0m \u001b[38;5;66;03m# callback.\u001b[39;00m\n\u001b[0;32m 1925\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\joblib\\parallel.py:1847\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1845\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_batches \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1847\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1848\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_completed_tasks 
\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1849\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_progress()\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\utils\\parallel.py:136\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 134\u001b[0m config \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mconfig):\n\u001b[1;32m--> 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:192\u001b[0m, in \u001b[0;36m_parallel_build_trees\u001b[1;34m(tree, bootstrap, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight, n_samples_bootstrap, missing_values_in_feature_mask)\u001b[0m\n\u001b[0;32m 189\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m class_weight \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbalanced_subsample\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 190\u001b[0m curr_sample_weight \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m=\u001b[39m compute_sample_weight(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbalanced\u001b[39m\u001b[38;5;124m\"\u001b[39m, y, indices\u001b[38;5;241m=\u001b[39mindices)\n\u001b[1;32m--> 192\u001b[0m \u001b[43mtree\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 
193\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 194\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 195\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurr_sample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 196\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_input\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 197\u001b[0m \u001b[43m \u001b[49m\u001b[43mmissing_values_in_feature_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmissing_values_in_feature_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 198\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 199\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 200\u001b[0m tree\u001b[38;5;241m.\u001b[39m_fit(\n\u001b[0;32m 201\u001b[0m X,\n\u001b[0;32m 202\u001b[0m y,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m missing_values_in_feature_mask\u001b[38;5;241m=\u001b[39mmissing_values_in_feature_mask,\n\u001b[0;32m 206\u001b[0m )\n", "File \u001b[1;32mc:\\storage\\university\\3 course\\AIM\\AIM-PIbd-32-Chubykina-P-P\\aimenv\\Lib\\site-packages\\sklearn\\tree\\_classes.py:472\u001b[0m, in \u001b[0;36mBaseDecisionTree._fit\u001b[1;34m(self, X, y, sample_weight, check_input, missing_values_in_feature_mask)\u001b[0m\n\u001b[0;32m 461\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 462\u001b[0m builder \u001b[38;5;241m=\u001b[39m BestFirstTreeBuilder(\n\u001b[0;32m 463\u001b[0m splitter,\n\u001b[0;32m 464\u001b[0m min_samples_split,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 469\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmin_impurity_decrease,\n\u001b[0;32m 470\u001b[0m )\n\u001b[1;32m--> 472\u001b[0m 
\u001b[43mbuilder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbuild\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtree_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmissing_values_in_feature_mask\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 474\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_outputs_ \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m is_classifier(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 475\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_classes_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_classes_[\u001b[38;5;241m0\u001b[39m]\n", "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n", "from sklearn.model_selection import cross_val_score\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# Загрузка данных\n", "df = pd.read_csv(\"..//static//csv//heart_2020_cleaned.csv\")\n", "\n", "# Разделение на обучающую и тестовую выборки (например, 70% обучающая, 30% тестовая)\n", "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)\n", "\n", "# Разделение обучающей выборки на обучающую и контрольную (например, 70% обучающая, 30% контрольная)\n", "train_df, val_df = train_test_split(train_df, test_size=0.3, random_state=42)\n", "\n", "# Вывод размеров выборок\n", "print(\"Размер обучающей выборки:\", len(train_df))\n", 
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# featuretools is used below but was not imported in this cell, so the cell\n",
"# only worked when `ft` happened to be left over in the kernel.\n",
"import featuretools as ft\n",
"\n",
"# Categorical columns that get one-hot encoded\n",
"categorical_features = [\n",
"    'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race',\n",
"    'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'\n",
"]\n",
"\n",
"def encode_split(frame):\n",
"    \"\"\"Return a new frame with a 0/1 target and one-hot encoded categoricals.\n",
"\n",
"    HeartDisease is mapped from 'No'/'Yes' to 0/1 so the binary metrics below\n",
"    (which use the default pos_label=1) and ROC AUC are well defined.\n",
"    \"\"\"\n",
"    with_target = frame.assign(HeartDisease=frame['HeartDisease'].map({'No': 0, 'Yes': 1}))\n",
"    return pd.get_dummies(with_target, columns=categorical_features, dtype=int)\n",
"\n",
"def build_entityset(frame, entityset_id):\n",
"    \"\"\"Wrap one encoded split in its own single-dataframe EntitySet.\"\"\"\n",
"    entityset = ft.EntitySet(id=entityset_id)\n",
"    # reset_index returns a new frame, so the encoded split passed in is not the\n",
"    # object handed to featuretools; make_index=True requests the integer 'id' column.\n",
"    return entityset.add_dataframe(\n",
"        dataframe_name='heart',\n",
"        dataframe=frame.reset_index(drop=True),\n",
"        index='id',\n",
"        make_index=True,\n",
"    )\n",
"\n",
"# One-hot encode the training split\n",
"train_df_encoded = encode_split(train_df)\n",
"\n",
"# Encode validation/test the same way and align them to the training columns:\n",
"# a category absent from one split would otherwise produce a different column set.\n",
"val_df_encoded = encode_split(val_df).reindex(columns=train_df_encoded.columns, fill_value=0)\n",
"test_df_encoded = encode_split(test_df).reindex(columns=train_df_encoded.columns, fill_value=0)\n",
"\n",
"# Define the entity sets, one per split\n",
"es = build_entityset(train_df_encoded, 'heart_data')\n",
"es_val = build_entityset(val_df_encoded, 'heart_data_val')\n",
"es_test = build_entityset(test_df_encoded, 'heart_data_test')\n",
"\n",
"# Generate feature definitions from the training split only\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='heart', max_depth=2)\n",
"\n",
"# Apply the same feature definitions to the validation and test rows.\n",
"# Both matrices used to be computed from the training EntitySet with\n",
"# validation/test row labels as instance ids, so they never held the held-out rows.\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es_val)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es_test)\n",
"\n",
"# Drop rows containing NaN\n",
"feature_matrix = feature_matrix.dropna()\n",
"val_feature_matrix = val_feature_matrix.dropna()\n",
"test_feature_matrix = test_feature_matrix.dropna()\n",
"\n",
"# Separate features from the target in every split\n",
"X_train = feature_matrix.drop('HeartDisease', axis=1)\n",
"y_train = feature_matrix['HeartDisease'].astype(int)\n",
"X_val = val_feature_matrix.drop('HeartDisease', axis=1)\n",
"y_val = val_feature_matrix['HeartDisease'].astype(int)\n",
"X_test = test_feature_matrix.drop('HeartDisease', axis=1)\n",
"y_test = test_feature_matrix['HeartDisease'].astype(int)\n",
"\n",
"# Model choice; n_jobs=-1 grows the trees on all cores (the stored run of the\n",
"# single-threaded fit was interrupted by hand)\n",
"model = RandomForestClassifier(random_state=42, n_jobs=-1)\n",
"\n",
"# Fit the model\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Predict and evaluate on the test split\n",
"y_pred = model.predict(X_test)\n",
"# ROC AUC ranks samples by score, so feed it the positive-class probability\n",
"y_proba = model.predict_proba(X_test)[:, 1]\n",
"\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"precision = precision_score(y_test, y_pred)\n",
"recall = recall_score(y_test, y_pred)\n",
"f1 = f1_score(y_test, y_pred)\n",
"roc_auc = roc_auc_score(y_test, y_proba)\n",
"\n",
"print(f\"Accuracy: {accuracy}\")\n",
"print(f\"Precision: {precision}\")\n",
"print(f\"Recall: {recall}\")\n",
"print(f\"F1 Score: {f1}\")\n",
"print(f\"ROC AUC: {roc_auc}\")\n",
"\n",
"# Cross-validation on the training split\n",
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')\n",
"accuracy_cv = scores.mean()\n",
"print(f\"Cross-validated Accuracy: {accuracy_cv}\")\n",
"\n",
"# Feature importance analysis\n",
"feature_importances = model.feature_importances_\n",
"feature_names = X_train.columns\n",
"\n",
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"sns.barplot(x='Importance', y='Feature', data=importance_df)\n",
"plt.title('Feature Importance')\n",
"plt.show()\n",
"\n",
"# Overfitting check: score the model on the data it was trained on\n",
"y_train_pred = model.predict(X_train)\n",
"y_train_proba = model.predict_proba(X_train)[:, 1]\n",
"\n",
"accuracy_train = accuracy_score(y_train, y_train_pred)\n",
"precision_train = precision_score(y_train, y_train_pred)\n",
"recall_train = recall_score(y_train, y_train_pred)\n",
"f1_train = f1_score(y_train, y_train_pred)\n",
"roc_auc_train = roc_auc_score(y_train, y_train_proba)\n",
"\n",
"print(f\"Train Accuracy: {accuracy_train}\")\n",
"print(f\"Train Precision: {precision_train}\")\n",
"print(f\"Train Recall: {recall_train}\")\n",
"print(f\"Train F1 Score: {f1_train}\")\n",
"print(f\"Train ROC AUC: {roc_auc_train}\")\n",
"\n",
"# Visualize the results\n",
"# Both vectors hold class labels, so a scatter plot collapses every sample onto\n",
"# at most four points. Show the actual-vs-predicted counts as a heatmap instead.\n",
"confusion = pd.crosstab(\n",
"    pd.Series(y_test).to_numpy(),\n",
"    pd.Series(y_pred).to_numpy(),\n",
"    rownames=['Actual HeartDisease'],\n",
"    colnames=['Predicted HeartDisease'],\n",
")\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues')\n",
"plt.xlabel('Predicted HeartDisease')\n",
"plt.ylabel('Actual HeartDisease')\n",
"plt.title('Actual vs Predicted HeartDisease')\n",
"plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "aimenv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 2 }