2024-11-29 21:18:24 +04:00
{
"cells": [
2024-11-30 09:21:24 +04:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Написал лабу, не сохранил, а коммиту, видимо, оказалось лень сохранять. Переписываю по новой (плачу).\n",
"\n",
"**Бизнес-цели:**\n",
"1. Определить, какие факторы влияют на величину страховых взносов (Charges), — помощь в разработке персонализированных страховых предложений.\n",
"2. Предсказать вероятность того, что человек является курильщиком (Smoker), — для выявления групп риска и оптимизации страховых тарифов.\n",
"\n",
"**Технические цели:**\n",
"1. Построить модель регрессии для предсказания величины взносов на основе других признаков.\n",
"2. Построить модель классификации для определения вероятности курения на основе доступных данных."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Регрессия:\n",
"Обучающая выборка: (1663, 6), Контрольная: (554, 6), Тестовая: (555, 6)\n",
"Классификация:\n",
"Обучающая выборка: (1663, 5), Контрольная: (554, 5), Тестовая: (555, 5)\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the medical-insurance dataset\n",
"df = pd.read_csv(\"..//datasets//Lab_1//Medical_insurance.csv\", sep=\",\")\n",
"\n",
"# Keep only the columns used by the two goals and drop rows with missing values\n",
"columns_to_use = [\"age\", \"sex\", \"bmi\", \"children\", \"smoker\", \"region\", \"charges\"]\n",
"filtered_data = df.loc[:, columns_to_use].dropna()\n",
"\n",
"# Goal 1: regression - the target is `charges`, every other column is a feature\n",
"y_reg = filtered_data[\"charges\"]\n",
"X_reg = filtered_data.drop(\"charges\", axis=1)\n",
"\n",
"# Goal 2: classification - the target is `smoker`; `charges` is excluded from the features as well\n",
"y_clf = filtered_data[\"smoker\"]\n",
"X_clf = filtered_data.drop([\"smoker\", \"charges\"], axis=1)\n",
"\n",
"# Regression split: 60% train, then the remaining 40% is halved into validation and test\n",
"X_reg_train, X_reg_temp, y_reg_train, y_reg_temp = train_test_split(\n",
"    X_reg, y_reg, test_size=0.4, random_state=42\n",
")\n",
"X_reg_val, X_reg_test, y_reg_val, y_reg_test = train_test_split(\n",
"    X_reg_temp, y_reg_temp, test_size=0.5, random_state=42\n",
")\n",
"\n",
"# Classification split: same 60/20/20 proportions, stratified by the target at both steps\n",
"X_clf_train, X_clf_temp, y_clf_train, y_clf_temp = train_test_split(\n",
"    X_clf, y_clf, test_size=0.4, random_state=42, stratify=y_clf\n",
")\n",
"X_clf_val, X_clf_test, y_clf_val, y_clf_test = train_test_split(\n",
"    X_clf_temp, y_clf_temp, test_size=0.5, random_state=42, stratify=y_clf_temp\n",
")\n",
"\n",
"# Report the shape of every split for both tasks\n",
"for task_title, (train_part, val_part, test_part) in (\n",
"    (\"Регрессия:\", (X_reg_train, X_reg_val, X_reg_test)),\n",
"    (\"Классификация:\", (X_clf_train, X_clf_val, X_clf_test)),\n",
"):\n",
"    print(task_title)\n",
"    print(f\"Обучающая выборка: {train_part.shape}, Контрольная: {val_part.shape}, Тестовая: {test_part.shape}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оцениваем сбалансированность"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов для классификации:\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABFLklEQVR4nO3deVxUdf///+egsggyuIIkKi6puJVLRpp5CYqpmWmaRrl+tFxTK9OrXMswvVLTTNPKJbW8stRWE7HUK0kNc19y3wiQEMYlEeH8/vDHfB3BjYABz+N+u83t5rzf7znndWZhnp7zPmcshmEYAgAAMDEXZxcAAADgbAQiAABgegQiAABgegQiAABgegQiAABgegQiAABgegQiAABgegQiAABgegQiFCoXL17UqVOndO7cOWeXglzE6wrA2QhEKPC++OILhYSEqESJEvLy8lLFihU1ZcoUZ5eFf+hef123bt0qV1dXnThxwtmloJDZt2+fihYtqj179ji7FFMhECFf7d27V88995zuu+8+ubm5yd/fX+Hh4dq7d2+240eNGqWuXbuqRIkSmj9/viIjI7Vu3ToNHDgwnytHbjLD6/r666+re/fuqlSpkr2tRYsWqlOnTpaxqampCgkJUZEiRfTFF1/kZ5lOMXjwYFksFmeXUWAFBQWpXbt2Gjt2rLNLMRcDyCdffvml4erqavj5+Rmvv/668dFHHxlvvPGGUb58ecPV1dX46quvHMb//PPPhiQjIiLCSRUjL5jhdf39998NScbmzZsd2h977DGjdu3aDm3p6elG586dDUnG+++/n59lOs2gQYMMvn5u7fvvvzckGYcPH3Z2KabBOxL54vDhw0bx4sWNmjVrGgkJCQ59Z8+eNWrWrGl4enoaR44csbe3b9/eeOSRR/K7VOQxM7yuQ4cONSpWrGhkZGQ4tGcXiF544QVDkjFmzJj8LNGpCES3d+XKFaNkyZKmel84G4fMkC+mTp2qS5cuad68eSpbtqxDX5kyZfThhx/q4sWLDnNIfv31V9WpU0fdunVTqVKl5OHhocaNG2vVqlX2MRcuXJCnp6deeumlLOs8ffq0ihQpooiICElSr169VLly5SzjLBaLxo8fb79/4sQJDRw4UDVq1JCHh4dKly6tLl266Pjx4w6P+/nnn2WxWPTzzz/b27Zt26ZWrVqpRIkS8vT0VIsWLbRp0yaHxy1cuFAWi0W//fabvS0xMTFLHZLUvn37LDVv2rRJXbp0UcWKFeXm5qaAgAANHz5cf//9d5ZtW7FihRo1aqQSJUrIYrHYb//5z3+yjM2uxsxb8eLFVbduXX300UcO43r16iUvL69bLuvG7bqT1zVTQkKC+vbtK19fX7m7u6t+/fpatGiRw5jjx4/bt2n69OmqVKmSPDw89Nhjj2WZg5Hde2DJkiVycXHR5MmT7W27du1Sr169VKVKFbm7u8vPz099+vTRX3/9dcttzbRq1Sq1bNnytoeFxo4dqw8//FD9+/fXxIkTsx0zfvx4h9ci89arVy/7mDt9z0pScnKyhg8frsqVK8vNzU0VKlRQjx49lJiYaB9z+fJljR8/Xvfff7/c3d1Vvnx5derUSUeOHJH0/57zhQsXOix70KBBWWqTpB9//FH333+/vLy8NHToUBmGIenaZ6hq1ary9vbWiBEjlJ6ebn9Mdp8vSWrXrl2W91Tmc3S9n376SW5ubnrxxRfv+nnK7jMq3fxzml3b1KlTZbFY1KJFC4f2o0ePqkuXLvL395eLi4v99bzxUGqxYsXUokULrV69WsgfRZ1dAMzhm2++UeXKlfXoo49m29+8eXNVrlxZ3333nb3tr7/+0rx58+x/RMuWLaslS5aoU6dOWrp0qbp37y4vLy899dRTWr58uaZNm6YiRYrYH//ZZ5/JMAyFh4ffVa3btm3T5s2b1a1bN1WoUEHHjx/XnDlz1KJFC+3bt0/FixfP9nGHDx9WixYtVLx4cb
366qsqXry45s+fr9DQUEVGRqp58+Z3VcfNfPHFF7p06ZIGDBig0qVLa+vWrZo1a5ZOnz7tMP8kOjpaXbt2Vf369TV58mRZrVYlJiZq+PDhd7yu6dOnq0yZMrLZbPrkk0/Ur18/Va5cWaGhoTmu/05eV0n6+++/1aJFCx0+fFiDBw9WYGCgvvjiC/Xq1UvJyclZQvDixYt1/vx5DRo0SJcvX9Z7772nli1bavfu3fL19c22lrVr16pPnz4aPHiwRo0aZW+PjIzU0aNH1bt3b/n5+Wnv3r2aN2+e9u7dq19//fWWQefMmTM6efKkGjRocMvnYfbs2XrzzTfVqVMnzZkz57bP26effmr/942v4Z2+Zy9cuKBHH31U+/fvV58+fdSgQQMlJibq66+/1unTp1WmTBmlp6erffv2ioqKUrdu3fTSSy/p/PnzioyM1J49e1S1atVs6zt8+LDmz5+fpf3o0aPq2LGjqlWrprfffltr1qyxB41BgwZpyJAh+v333zV9+nSVLVtWo0ePvulzsHHjRn3//fe3fa527typjh07qm3btpo9e/ZdP0//VHJysv0/YtdLT09Xhw4ddOLECQ0bNkz333+/LBaLJk2alO1yGjZsqNWrV8tms8nb2ztXasMtOHsXFe59ycnJhiTjySefvOW4Dh06GJIMm81mGIZhSDIkGT///LN9zKVLl4xatWoZfn5+xpUrVwzDMIwff/zRkGT88MMPDsurV6+e8dhjj9nv9+7d26hYsWKW9Uoyxo0b57COG0VHRxuSjMWLF9vbfvrpJ0OS8dNPPxmGYRidO3c2ihQpYuzZs8c+JjEx0ShdurTRsGFDe9uCBQsMSca2bdvsbWfPns1Sh2EYRrt27YxKlSo5tGVXX0REhGGxWIwTJ07Y20aPHm1IMv78809727FjxwxJxtSpU7Ms43qZNR47dsze9scffxiSjClTptjbevbsaXh6et5yWTdu152+rjNmzDAkGUuWLLGPu3LlihEcHGx4eXnZ3yeZ2+Th4WGcPn3aPnbLli2GJGP48OEO9WY+n7/99pvh5eVldOnSxUhPT3eoObvn+LPPPjMkGRs3brzl9q5bt86QZHzzzTdZ+jIPmS1fvtxwcXExJBnLli275fJef/11w2KxOLRVqlTJ6Nmz5y3rze49O3bsWENSlvl6hmHYD+998sknhiRj2rRpNx2T+ZwvWLDA3te1a1ejTp06RkBAgENtQ4cONUqUKGEkJiYahmEYaWlpxsMPP2xIMrZs2WIf1717d6NcuXLG5cuXDcPI+vkyDMNo0qSJ8fjjj2d5T40bN85+CO748eNG+fLljWbNmhl///23Q/13+jxl9xk1jJt/Tm9sGzlypFGuXDmjYcOGDn+DDh48mO38uewOpRqGYSxbtizL84S8wyEz5Lnz589LkkqUKHHLcZn9NpvN3ta4cWM99thj9vseHh4aOHCg4uLitH37dklSaGio/P39tXTpUvu4PXv2aNeuXXruuefsbeXKlVNCQoKuXLlyyzo8PDzs/05LS9Nff/2latWqycfHx77O66WkpCghIUGRkZEKCwtT7dq17X2lS5dWr169FBMTo/j4+Fuu905dX9/FixeVmJioRx55RIZh6Pfff7f3nT9/Xi4uLvLx8cnxus6dO6fExEQdPXpU06dPV5EiRRxej0yJiYlKTEzU5cuX72i5d/K6fv/99/Lz87PvMZKuHUYYOnSoLly4oA0bNjgss2PHjrrvvvvs9x966CE1adIk2z0KR48eVbt27fTAAw/o008/lYuL45/C65/jy5cvKzExUQ8//LAkZfseuF7mYbWSJUtm23/q1Ck9//zzql+/vurXr6+XX35ZKSkpN13elStX5Obmdst13ul79ssvv1T9+vX11FNPZVlG5l6vL7/8UmXKlNGQIUNuOuZGMTEx+uKLLxQREZHluYyKilLz5s1VunRpSVLRokXVsGFDSddeo0ydOnVSQkLCTU81/+qrr7Rt2z
aHQ5s3+uuvvxQWFqYSJUro66+/lru7u0P/3X62c+LMmTOaNWuWxowZk+Vwcubfwszn4nYy30PXH85E3iEQIc9lBp3
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+MklEQVR4nO3deXgNd///8VcS2WQlIqFFbEUQe0mtJYTGVnRBi9ZdLVFUb1Tv1ta6LVVrbXcXS1G9daGoJSiKFLXvP9UoLUnEkoQ2iSTz+6NXztdxEssRTsz9fFzXua6cz3xm5j2Tc3JemfnMHCfDMAwBAACYlLOjCwAAALifCDsAAMDUCDsAAMDUCDsAAMDUCDsAAMDUCDsAAMDUCDsAAMDUCDsAAMDUCDsAHjjDMHTp0iWdPHnS0aUA+B9A2AHwQKSmpuqdd95RpUqV5ObmpoCAAD322GM6ceKEo0vLN/369VPLli0dXQYeQm+99Zbq16/v6DJMi7CDOzZ//nw5OTnp559/tpn28ccfy8nJSR07dlRWVpYDqkNBdvHiRYWHh2v69Onq0qWLVqxYoZiYGG3evFkhISGOLi9fxMXF6ZNPPtHbb79taTt9+rScnJw0adIkm/579+6Vr6+vKlSooISEhAdZqkN4e3urV69eji6jwBo0aJAOHDig7777ztGlmFIhRxeAh9+3336rvn37qnHjxlq6dKlcXFwcXRIKmCFDhuj8+fOKjY1V1apVHV3OfTFt2jSVLVtWTz755G37njx5Um3atJGXl5fWr1+voKCgB1AhCrLg4GB16NBBkyZNUvv27R1djulwZAf3ZPPmzeratatCQ0O1cuVKeXh4OLokFDCJiYlasGCBxo8fb9qgc/36dS1evFjPPvvsbfueP39ekZGRSktL05o1a1SuXLkHUCEeBs8++6y2bdumX3/91dGlmA5hB3bbv3+/OnTooBIlSmjdunXy8/Oz6bNs2TLVqVNHnp6eKlasmF544QX98ccfVn169eolb29vm3m/+uorOTk5afPmzZKkZs2aycnJ6ZaPHE5OTurfv78WL16sSpUqycPDQ3Xq1NHWrVtt1rNv3z61adNGvr6+8vb2VosWLfTTTz/lus151TB//nyrPtWqVbvt/sup8WZt27a1ObUzadIkPfHEEwoICJCnp6fq1Kmjr776ymbeq1ev6s0331S5cuXk6upqVWNSUtIt67l524oVK6aoqCgdPnz4jurOkXO68/Tp05Kk3bt3Kzs7WxkZGapbt648PDwUEBCgrl276syZMzbzb9q0SY0bN5aXl5f8/f3VoUMHHTt2zKrPqFGj5OTkpOPHj+vZZ5+Vr6+vAgICNHDgQKWlpdnUO2rUKMvzzMxMPfXUUypatKiOHj1qaZ83b56aN2+u4sWLy93dXaGhoZo9e/Yt91mObdu2KSkpSREREbfsd+XKFbVu3Vrnzp3T8uXLVbNmzVz7hYSE5Po6y3kvSNKKFSsUFRWlkiVLyt3dXeXLl9d7772X62nknTt36qmnnlKRIkXk5eWlsLAwTZs2zapPzr4MDAyUp6enKlWqpH/961+W6Tn7/EZXr15VcHCwTW3Z2dkaMmSI/Pz8FBISorVr11qmDRs2TD4+PqpYsaLWrFljtbxevXrZvPbPnj0rT09Pq9dUzj66+bRYnz595OHhYdd+yut9O2nSJJt13/waz9nmsLAwm78H0t9/y+rWrSsfHx+r3+fNpzdzXj8rVqywqQP3htNYsMupU6fUunVrubu7a926dSpRooRNn/nz5+ull15SvXr1NG7cOCUkJGjatGnavn279u3bJ39//7ta57/+9S/94x//kCQlJSXpjTfeUJ8+fdS4ceNc+2/ZskVffvmlBgwYIHd3d82aNUutW7fWrl27LH/Ujhw5osaNG8vX11dDhw6Vq6ur5s6dq2bNmmnLli25DhisXLmy5UMgp477bdq0aWrfvr26d++ujIwMLV26VM8884
xWrVqlqKgoS78hQ4Zozpw56t27txo2bChXV1d98803+vbbb+9oPTnbZhiGTp06pcmTJ+upp57KNZTcqYsXL0qS+vfvrzp16mj8+PG6cOGCpk+frm3btmnfvn0qVqyYJGnDhg1q06aNypUrp1GjRumvv/7SjBkz1LBhQ+3du9fmg/DZZ59VSEiIxo0bp59++knTp0/X5cuXtXDhwjzr+cc//qHNmzcrJiZGoaGhlvbZs2eratWqat++vQoVKqSVK1eqX79+ys7OVnR09C23cceOHXJyclKtWrXy7JOWlqb27dvr8OHD+u9//3vb012NGzdWnz59JEnHjh3Tv//9b6vp8+fPl7e3twYPHixvb29t2rRJI0aMUEpKij744ANLv5iYGLVt21YlSpTQwIEDFRwcrGPHjmnVqlUaOHCgJOngwYNq3LixXF1d1adPH4WEhOjUqVNauXKlxo4dm2eNH374Ya7jjSZMmKBJkybpxRdfVJ06dfTGG28oIyNDq1evVs2aNTV27Fh98skn6tSpk44ePaqyZcvmuY4RI0bYBNjcjBw5Up9++qm+/PJLNWvW7K730736/PPPdejQIZv22NhYPfvss6pRo4bGjx8vPz+/PP9u+Pn5qXz58tq+ffsD+bvyP8UA7tC8efMMScaqVauM8uXLG5KMVq1a5do3IyPDKF68uFGtWjXjr7/+srSvWrXKkGSMGDHC0tazZ0/Dy8vLZhnLli0zJBk//PCDzbS4uDhDkjFv3rxc1y/JkGT8/PPPlrbffvvN8PDwMJ5++mlLW8eOHQ03Nzfj1KlTlrZz584ZPj4+RpMmTWyW27BhQ+PJJ5+8ZR1NmzY1qlatmmtdN9cYHR1t0x4VFWWUKVPGqu3PP/+0ep6RkWFUq1bNaN68uVV7iRIljMjISKu2kSNHGpKMCxcu3LKepk2bGk2bNrVqe/vttw1JRmJi4m3rzpHzOomLi7N6HhoaarUdP/zwgyHJePPNNy1tNWvWNIoXL25cvHjR0nbgwAHD2dnZ6NGjh802tW/f3mrd/fr1MyQZBw4csKp35MiRhmEYxvDhww0XFxdj+fLlNnXfvI8NwzAiIyONcuXK5bmtOV544QUjICDApj3n9TF+/HijQ4cOhiQjMDDQSE5OvuXyHnnkEeOll16yPM/ZVze+F3Kr99VXXzUKFy5spKWlGYZhGJmZmUbZsmWNMmXKGJcvX7bqm52dbfm5SZMmho+Pj/Hbb7/l2Sdnn+dITEw0fHx8jDZt2ljVlpaWZhQvXtzo2rWrpe+BAwcMFxcXo0aNGkZ6erphGIaRlJRk+Pj4GAMHDrT069mzp9Vr//Dhw4azs7NlHTmvKcMwjDJlyhg9e/Y0DMMw5s6da0gyZsyYYbNP7mQ/GUbe79sPPvjAZt03v8bT0tKM0qVLW+q88e/B8OHDDUnG+fPnLW05r4sPPvjAZn2tWrUyqlSpYtOOe8NpLNy1Xr166ezZs+rWrZvWr1+vZcuW2fT5+eeflZiYqH79+lmN44mKilLlypW1evVqm3mSkpKsHqmpqfdUZ3h4uOrUqWN5Xrp0aXXo0EHr1q1TVlaWsrKytH79enXs2NFq3ESJEiXUrVs3bdu2TSkpKVbLzMjIkLu7+23XnZWVZdmOjIyMPPulpaXZbPf169dt+nl6elp+vnz5spKTk9W4cWPt3bvXql9qaqoCAgJuW19erl+/rqSkJF24cEGxsbH69ttvFRYWZjnycnPdFy9eVHZ29h0tOzo62mo7mjVrpjp16lheC+fPn9f+/fvVq1cvFS1a1NIvLCxMLVu21Pfff5/rMm/0+uuvS1KufT/66CONGzdO06dPV4cOHWym31hbcnKykpKS1LRpU/36669KTk6+5bZdvHhRRYoUyXP6pEmTtGLFCvXo0UMXL17U8OHDb7m8O3md3VhvamqqkpKS1LhxY/355586fvy4pL9P0cbFxWnQoEE2R1JzTklduHBBW7du1csvv6zSpUvn2ic377
33nvz8/DRgwACr9kOHDikxMVGdOnWytIWFhcnDw0M1a9aUm5ubJCkgIEBNmjTRxo0b81zH8OHDVbt2bT3zzDN59lm
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA9CElEQVR4nO3deXxMZ///8fcksssiREIba7TEVrtQpRJJCaroRhVVlFBLb9q0tba9Fa2tlNbdUtWWrtpSSyxVJXbaWm+1lEojUiShJJI5vz/6y3yNhDKGiXO/no/HPB7mOtc58znHTPLOda5zxmIYhiEAAACTcnN1AQAAADcTYQcAAJgaYQcAAJgaYQcAAJgaYQcAAJgaYQcAAJgaYQcAAJgaYQcAAJgaYQfAdbFarUpPT9ehQ4dcXQoAXBPCDoB/lJqaqsGDB6t8+fLy9PRUSEiIIiMjlZmZ6erSnKZNmzbq3bu3q8vAbeixxx7TI4884uoycBWEHdxSFovlmh7ff/+9q0vF//frr7+qQYMGWrBggfr27avFixcrKSlJq1atkp+fn6vLc4r169drxYoVev75521t33//vSwWiz7//PMC/ZcuXSoPDw81btxY586du5Wl3nLp6emyWCwaPXq0q0spsp5//nl98cUX+umnn1xdCq6gmKsLwP+WDz/80O75vHnzlJSUVKC9WrVqt7IsXEXfvn3l6empjRs36o477nB1OTfFxIkTFR0drYiIiH/sm5ycrM6dOysiIkJLliwxTeCD4+rUqaP69evrzTff1Lx581xdDgpB2MEt9cQTT9g937hxo5KSkgq0o2jYtm2bVq9erRUrVpg26KSlpWnJkiWaNWvWP/bds2eP2rZtqxIlSmj58uUqWbLkLagQt4NHHnlEo0aN0ttvv63ixYu7uhxchtNYKNKys7M1atQoRUREyMvLS+Hh4Ro+fLiys7ML9J0/f74aNmwoX19flShRQvfdd59WrFghSapQocJVT5tVqFDBtp1z587pueeeU3h4uLy8vHT33XfrjTfekGEYdq936fru7u6644471KdPH505c8bWJycnRyNHjlS9evUUGBgoPz8/NWvWTGvWrClQf1pamnr16qVy5crJ3d3dtu1r+cF56f65ubkpLCxMjz76qI4ePWrrc+TIEVksFr3xxhtX3M7o0aNlsVhszzdu3Chvb28dPHhQ1atXl5eXl8LCwtS3b1+dOnWqwPqfffaZ6tWrJx8fH5UqVUpPPPGEjh8/btenR48eKl68uA4dOqS4uDj5+fmpbNmyGjt2rN0xzq937ty5trasrCzVq1dPFStW1B9//GFrf+ONN9SkSROVLFlSPj4+qlevXqGnnwqzZMkS5ebmKiYm5qr9jh49qri4OBmGoeXLl6tcuXKF9rvSe+zIkSO2PnPmzFHLli1VunRpeXl5KTIyUjNnzix0e0uXLlXz5s3l7++vgIAANWjQQB9//LFdn02bNqlNmzYqUaKE/Pz8VKtWLU2dOtW2vEePHnbvcUk6duyYfHx8CtT2119/qWfPnvLz81NkZKS2bdsmSbp48aJ69uwpX19f1a5dW1u3brXbXosWLdSiRQu7ti1bttj2//JjdOlpsdzcXLVp00bBwcHas2fPdR+nChUqqG3btgXaBwwYUOC1L3+PS9LZs2cVFhZW6Cn0mTNnqkaNGvL19bX7/7z8/dWqVSudO3dOSUlJBeqA6zGygyLLarWqffv2+vHHH9WnTx9Vq1ZNv/zyiyZPnqz//ve/WrRoka3vmDFjNHr0aDVp0kRjx46Vp6enNm3apNWrVys2NlZTpkzR2bNnJUl79+7Vv//9b7344ou202X5gcIwDLVv315r1qxRr169dM8992j58uUaNmyYjh8/rsmTJ9vV+NBDD6ljx47Kzc1VcnKy3n33XZ0/f952Wi4zM1P/+c9/9Pjjj6t3797KysrSe++9p7i4OG3evFn33HOPbVvdu3fXypUrNX
DgQNWuXVvu7u569913tX379ms6Xs2aNVOfPn1ktVq1a9cuTZkyRSkpKVq3bp2j/wX6888/deHCBfXr108tW7bUM888o4MHD2rGjBnatGmTNm3aJC8vL0nS3Llz1bNnTzVo0EDjxo3TiRMnNHXqVK1fv147duxQUFCQbbt5eXl64IEH1LhxY02YMEHLli3TqFGjlJubq7FjxxZay8WLF9WpUycdPXpU69evV5kyZWzLpk6dqvbt26tr167KycnRggUL9PDDD2vx4sWKj4+/6j5u2LBBJUuWVPny5a96HOLi4vTnn39q5cqVql69+lW3mf++kKR169bp3XfftVs+c+ZMVa9eXe3bt1exYsX07bffqn///rJarUpISLD1mzt3rp566ilVr15diYmJCgoK0o4dO7Rs2TJ16dJFkpSUlKS2bduqTJkyGjRokMLCwrR3714tXrxYgwYNumKNI0eO1IULFwq0DxkyRB988IEGDBigO++8U/3795ckvfvuu2rZsqVeffVVTZ06Va1bt9ahQ4fk7+9/xde4dA7U1Tz99NP6/vvvlZSUpMjIyOs+TjfqzTff1IkTJwq0L1y4UP3791eLFi00cOBA+fn52X5+XC4yMlI+Pj5av369HnroIafVBicxABdKSEgwrvQ2/PDDDw03Nzdj3bp1du2zZs0yJBnr1683DMMwDhw4YLi5uRkPPfSQkZeXZ9fXarUW2O6aNWsMScaaNWsKLFu0aJEhyXj11Vft2jt37mxYLBbj119/tbVJMkaNGmXXr0mTJkZkZKTteW5urpGdnW3X5/Tp00ZoaKjx1FNP2drOnz9vuLm5GX379rXr2717d8PPz69AnZcrX7680b17d7u2Ll26GL6+vrbnhw8fNiQZEydOvOJ2Ro0aZff/kf88OjrayM3NtbXPmTPHkGS89dZbhmEYRk5OjlG6dGmjRo0axvnz5239Fi9ebEgyRo4cabdPkoyBAwfa2qxWqxEfH294enoaJ0+etKt3zpw5htVqNbp27Wr4+voamzZtKlD3X3/9Zfc8JyfHqFGjhtGyZcsr7mu+e++916hXr16B9vz3ydy5c42GDRsakozq1asbFy9evOK2Ll68aEgyxowZY2vLP1aHDx++Yr2GYRhxcXFGpUqVbM/PnDlj+Pv7G40aNbI7pobxf+/r3Nxco2LFikb58uWN06dPF9rHMP4+5uXLl7c937Vrl+Hm5ma0bt3arrbU1FTD09PTSExMtPXN/z9s06aNbZt79+41LBaLMXnyZFu/5s2bG82bN7c9/+677wxJxgMPPFDgM37pZycxMdFwd3c3Fi1aVOCYXMtxMoy/3//x8fEF+hb28+Xy93haWprh7+9vOxaX/lx4/PHHjaCgILvjn/+++Oyzzwq83l133WW0bt26QDtcj9NYKLI+++wzVatWTVWrVlV6errt0bJlS0mynQpatGiRrFarRo4cKTc3+7f05cPV/+S7776Tu7u7nn32Wbv25557ToZhaOnSpXbtf/31l9LT05Wammq7GiM6Otq23N3dXZ6enpL+Hqk6deqUcnNzVb9+fbsRm3Pnzslqtd7QHJDs7Gylp6crLS1NSUlJWr16tV0tl9d8+vTpAqfmrmTo0KFyd3e3Pe/WrZtCQ0O1ZMkSSdLWrVuVlpam/v37y9vb29YvPj5eVatWtfW71IABA2z/tlgsGjBggHJycrRy5coCfYcNG6aPPvpIn376qRo2bFhguY+Pj+3fp0+fVkZGhpo1a3ZNo2J//vmnSpQoccXl//rXv7RlyxZ1795du3fvLjC6d6mcnBxJso12Xcml9WZkZCg9PV3NmzfXoUOHlJGRIenvEZusrCy98MILdsdU+r/39Y4dO3T48GENHjzYbuTs0j6FSUxMVN26dfXwww/bta9bt045OTm2USlJatSokSSpQYMGtm1WrVpVkZGRWrVqVaHbNwxDiYmJ6tSpk239wkyfPl3jxo3TtGnT9OCDDxZYfi3H6Ua98s
orCgwMLPCZl/4+berr61vg+F9JiRIllJ6e7pS64FyEHRRZBw4c0O7duxUSEmL3uOuuuyT9PcdFkg4ePCg3Nze74W9
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение целевой переменной для регрессии:\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjsAAAHHCAYAAABZbpmkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABY7klEQVR4nO3dd1gU1/s28HvpZekdRUURRaxBRewKBkuIvZIoajSxxRY1JrHFGOvXEms0UWOLJUYTTcQg1hgrESsaVBSjgCABBKXuef/wx7yuFGFZXBzvz3Xtdblzzpx55jjC4zlnZhRCCAEiIiIimdLTdQBERERE5YnJDhEREckakx0iIiKSNSY7REREJGtMdoiIiEjWmOwQERGRrDHZISIiIlljskNERESyxmSHiIiIZI3JDhHJytGjR6FQKPDTTz/pOhSdOHv2LIyMjHD37l1dh/LGyMnJgZubG1atWqXrUKgITHboldu4cSMUCoX0MTExgaenJ0aPHo2EhARdh0cV2NGjR9GjRw84OzvDyMgIjo6OCAoKws8//6zr0CqMzz//HP3790fVqlWlbW3btlX7N2dra4smTZpg/fr1UKlUOoxWHgwNDTFhwgTMmTMHmZmZug6HCsFkh3Tmyy+/xObNm7FixQo0b94cq1evhp+fH548eaLr0KgCmjFjBtq1a4crV67gww8/xJo1azBp0iSkp6ejZ8+e2LZtm65D1LnIyEgcOnQIH330UYGyypUrY/Pmzdi8eTOmTZuG3NxcDB06FJ999pkOIpWfwYMHIykpiddhRSWIXrENGzYIAOLcuXNq2ydMmCAAiG3btukoMqqodu3aJQCIXr16iezs7ALloaGhYt++fUIIIY4cOSIAiF27dpVrTOnp6eXaviY+/vhjUaVKFaFSqdS2t2nTRnh7e6tty8jIEJUrVxbm5uaF9imV3jvvvCNatWql6zCoEBzZoQqjffv2AICYmBgAQHJyMj755BPUq1cPSqUSlpaW6NSpEy5evFhg38zMTMycOROenp4wMTGBi4sLevTogVu3bgEA7ty5ozaM/+Knbdu2Ulv5az527NiBzz77DM7OzjA3N8e7776Le/fuFTj2mTNn0LFjR1hZWcHMzAxt2rTByZMnCz3HF6cT8j8zZ84sUHfLli3w8fGBqakpbG1t0a9fv0KPX9y5PU+lUmHp0qXw9vaGiYkJnJyc8OGHH+K///5Tq1etWjW88847BY4zevToAm0WFvvChQsL9CkAZGVlYcaMGfDw8ICxsTHc3NwwefJkZGVlFdpXz5s2bRpsbW2xfv16GBoaFigPDAwsELNKpcKcOXNQuXJlmJiYwN/fHzdv3lSrc+LECfTu3RtVqlSRYho/fjyePn2qVi8kJARKpRK3bt1C586dYWFhgeDgYADA06dP8fHHH8Pe3h4WFhZ49913cf/+/UL75v79+xgyZAicnJxgbGwMb29vrF+/vsD5LF++HN7e3jAzM4ONjQ0aN25cohGDvXv3on379gX+ngpjZmaGZs2aISMjA4mJidL2klzPM2fOLPR6MzAwkOq0bdsWdevWRUREBJo3bw5TU1O4u7tjzZo1BWIpzbWxZcsWNG3aVOqb1q1b448//lCrc+DAAbRp0wYWFhawtLREkyZN1Pqvbdu2Ba7Pc+fOFfrvJj09HRMnTkT16tVhaGiodr5JSUlqdTt06IA///wTycnJRfQ66YrBy6sQvRr5iYmdnR0A4Pbt29i7dy969+4Nd3d3JCQk4Ntvv0WbNm1w7do1uLq6AgDy8vLwzjvvIDw8HP369cPYsWPx+PFjhIWF4cqVK6hRo4Z0jP79+6Nz585qx506dWqh8cyZMwcKhQJTpkzBw4cPsXTpUgQEBCAyMhKmpqYAgMOHD6NTp07w8fHBjBkzoKenhw0bNqB9+/Y4ceIEmjZtWqDdypUrY+7cuQCe/SAdMWJEoceeNm0a+vTpgw8++ACJiYlYvnw5WrdujQ
sXLsDa2rrAPsOHD0erVq0AAD///DP27NmjVv7hhx9i48aNGDx4MD7++GPExMRgxYoVuHDhAk6ePFloElFaKSkp0rk9T6VS4d1338Wff/6J4cOHw8vLC5cvX8aSJUvwzz//YO/evUW2GR0djevXr2PIkCGwsLAocSzz5s2Dnp4ePvnkE6SmpmLBggUIDg7GmTNnpDq7du3CkydPMGLECNjZ2eHs2bNYvnw5/v33X+zatUutvdzcXAQGBqJly5ZYtGgRzMzMADxLhHbu3In3338fzZo1w7Fjx9ClS5cC8SQkJKBZs2ZQKBQYPXo0HBwccODAAQwdOhRpaWkYN24cAGDdunX4+OOP0atXL4wdOxaZmZm4dOkSzpw5gwEDBhR5vvfv30dsbCzeeuutEvfR7du3oa+vL11Ppb2eV69eDaVSKX3X01P///N///2Hzp07o0+fPujfvz927tyJESNGwMjICEOGDAFQumtj1qxZmDlzJpo3b44vv/wSRkZGOHPmDA4fPoy3334bwLM1gUOGDIG3tzemTp0Ka2trXLhwAaGhocX235QpUwrdPmnSJKxZswZDhw5FixYtYGhoWOi/LwDw8fGBEAJ//fVXof9hIB3S9dASvXnyp7EOHTokEhMTxb1798T27duFnZ2dMDU1Ff/++68QQojMzEyRl5entm9MTIwwNjYWX375pbRt/fr1AoBYvHhxgWPlD+fHxMQIAGLhwoUF6nh7e4s2bdpI3/OnQSpVqiTS0tKk7Tt37hQAxLJly6S2a9asKQIDA9WmDZ48eSLc3d1Fhw4dChyrefPmom7dutL3xMREAUDMmDFD2nbnzh2hr68v5syZo7bv5cuXhYGBQYHt0dHRAoD44YcfpG0zZswQz//zPnHihAAgtm7dqrZvaGhoge1Vq1YVXbp0KRD7qFGjxIs/Ml6MffLkycLR0VH4+Pio9enmzZuFnp6eOHHihNr+a9asEQDEyZMnCxwv3y+//CIAiCVLlhRZ53n5f39eXl4iKytL2r5s2TIBQFy+fFna9uTJkwL7z507VygUCnH37l1p26BBgwQA8emnn6rVjYiIEADEuHHj1LaHhIQU6JuhQ4cKFxcXkZSUpFa3X79+wsrKSoqla9euBaacSuLQoUMCgDSd97w2bdqI2rVri8TERJGYmCiioqLExx9/LACIoKAgIUTpruf86ysxMbHIeNq0aSMAiP/973/StqysLNGwYUPh6OgoTZ2V9NqIjo4Wenp6onv37gV+LuTHm5KSIiwsLISvr694+vRpoXXyY3v++vz9998FANGxY8cC17iLi4sIDAxU21bU+T948EAAEPPnzy+yX0g3OI1FOhMQEAAHBwe4ubmhX79+UCqV2LNnDypVqgQAMDY2lv6nmJeXh0ePHkGpVKJWrVr4+++/pXZ2794Ne3t7jBkzpsAxSjKcX5SBAweqjST06tULLi4u+P333wE8WwwaHR2NAQMG4NGjR0hKSkJSUhIyMjLg7++P48ePF7jTJTMzEyYmJsUe9+eff4ZKpUKfPn2kNpOSkuDs7IyaNWviyJEjavWzs7MBPOuvouzatQtWVlbo0KGDWps+Pj5QKpUF2szJyVGrl5SU9NK7TO7fv4/ly5dj2rRpav/bzz++l5cXateurdZm/tTli8d/XlpaGgCUalQHeLZg1MjISPqeP+p1+/ZtaVv+CB0AZGRkICkpCc2bN4cQAhcuXCjQ5oujcKGhoQCAkSNHqm1/8VoUQmD37t0ICgqCEEKtDwIDA5Gamipd09bW1vj3339x7ty5Up3vo0ePAAA2NjaFll+/fh0ODg5wcHCAl5cXli9fji5dukjTaJpczy9jYGCADz/8UPpuZGSEDz/8EA8fPkRERASAkl8be/fuhUqlwvTp0wuMIOX/Ow8LC8Pjx4/x6aefFvh3VtTPAiEEpk6dip49e8LX17dA+ePHj6XR5pfJ7/sXp7dI9ziNRTqzcuVKeHp6wsDAAE5OTqhVq5baDz
GVSoVly5Zh1apViImJQV5enlT2/A+fW7duoVatWmrrBbShZs2aat8VCgU8PDxw584dAM+mVwBg0KBBRbaRmpqq9ss
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"def plot_balance(y, title):\n",
"    \"\"\"Show a bar chart with the number of occurrences of each distinct value in ``y``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    y : object exposing ``value_counts()`` (the notebook passes pandas Series)\n",
"        Target values whose class balance is inspected.\n",
"    title : str\n",
"        Title placed above the chart.\n",
"    \"\"\"\n",
"    class_counts = y.value_counts()\n",
"    # Labels are cast to str so they are drawn as discrete categories on the x axis\n",
"    class_labels = class_counts.index.astype(str)\n",
"    plt.bar(class_labels, class_counts.values)\n",
"    plt.xlabel(\"Классы\")\n",
"    plt.ylabel(\"Количество\")\n",
"    plt.title(title)\n",
"    plt.show()\n",
"\n",
"# Classification target (smoker): one bar chart per split\n",
"print(\"Распределение классов для классификации:\")\n",
"for split_target, chart_title in (\n",
"    (y_clf_train, \"Обучающая выборка (Классификация)\"),\n",
"    (y_clf_val, \"Контрольная выборка (Классификация)\"),\n",
"    (y_clf_test, \"Тестовая выборка (Классификация)\"),\n",
"):\n",
"    plot_balance(split_target, chart_title)\n",
"\n",
"# Regression target (charges): overlaid histograms of the three splits on one figure\n",
"print(\"Распределение целевой переменной для регрессии:\")\n",
"for split_target, legend_label in (\n",
"    (y_reg_train, \"Обучающая\"),\n",
"    (y_reg_val, \"Контрольная\"),\n",
"    (y_reg_test, \"Тестовая\"),\n",
"):\n",
"    plt.hist(split_target, bins=30, alpha=0.7, label=legend_label)\n",
"plt.title(\"Распределение Charges (Регрессия)\")\n",
"plt.xlabel(\"Значение Charges\")\n",
"plt.ylabel(\"Частота\")\n",
"plt.legend()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Несбалансированно, нужно исправлять"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов после балансировки:\n",
"smoker\n",
"no 50.0\n",
"yes 50.0\n",
"Name: proportion, dtype: float64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlIAAAHHCAYAAAB0nLYeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABOIklEQVR4nO3deVwV9f7H8Teo7JsoghQqLqW45E5opQlK7uaWZuWWmktu3UzvTVOra2qpaabpLcWk8qY3KysTtdQScddcMtc0FRAXQAxBmN8fPZifBxB1AgF9PR+P83hwvvOdmc+cmXPOm9mOnWEYhgAAAHDb7Au7AAAAgOKKIAUAAGARQQoAAMAighQAAIBFBCkAAACLCFIAAAAWEaQAAAAsIkgBAABYRJD6m1JSUnTq1CldvHixsEtBPmK9Fm+ZmZlKSEjQsWPHCrsUAHc5gpQFn3/+uUJDQ+Xu7i43NzdVqFBB06ZNK+yy8DexXou32NhYjRw5UhUrVpSDg4N8fHwUFBSkpKSkwi4Nd0ibNm00YMCAwi4DxVCPHj3UvXt3S+Pe80Fq//79euaZZ3TffffJ0dFR/v7+6tWrl/bv359r/7Fjx6p79+5yd3fXwoULFRUVpbVr12rIkCF3uHLkJ9Zr8XbkyBE1atRIn332mQYNGqRVq1YpKipK69atk6ura2GXhzvg559/1po1a/TKK6+YbT/++KPs7Oy0fPnyHP2/++47lSpVSg8//LBSUlLuZKl3XEJCguzs7DRx4sTCLqXIeuWVV7RixQrt2bPntsctWQD1FBv/+9//1LNnT3l7e6t///4KDAzUiRMn9OGHH2r58uX67LPP9OSTT5r9N2zYoKlTp2rKlCkaO3ZsIVaO/MR6Lf4GDRokBwcHbdmyRffdd19hl4NCMH36dIWGhqpq1ao37RsdHa2uXbuqatWq+uabbwjbUL169dSwYUO98847WrJkye2NbNyjjhw5Yri4uBjVq1c34uPjbYadO3fOqF69uuHq6mocPXrUbG/Xrp3RpEmTO10qChjrtXjbvn27IclYs2ZNYZeCQhIXF2eULFnS+M9//mPT/sMPPxiSjM8//9xs279/v+Ht7W3cd999xu+//36nSy0U586dMyQZr732WmGXUqS9/fbbhqurq5GcnHxb492zh/amT5+uK1euaMGCBfLx8bEZVrZsWX3wwQdKSUmxOUdmy5YtqlWrlnr06CFvb285OzurUaNGWrlypdnn8uXLcnV11YgRI3LM848//lCJEiU0ZcoUSVKfPn1UqVKlHP2y74L9/fffNWTIED344INydnZWmTJl1K1bN504ccJmvKzd2D/++KPZtm3bNrVs2VLu7u5ydXVV8+bNtWnTJpvxFi9eLDs7O23fvt1su9Gu4Hbt2uWoedOmTerWrZsqVKggR0dHBQQEaNSoUfrzzz9zLNvy5cvVsGFDubu7y87Ozny8/fbbOfrmVmPWw8XFRbVr19Z//vMfm359+vSRm5tbntPKvly3sl6zxMfHq3///vL19ZWTk5MeeughRURE2PQ5ceKEuUwzZ85UxYoV5ezsrGbNmmnfvn056s3+ei5dulT29vZ66623zLa9e/eqT58+qly5spycnOTn56d+/frp/PnzeS6r9P/bxY0effr0sel/7NgxdevWTd7e3nJxcdHDDz+sb775Jsd0U1NTNXHiRD3wwANycnJS+fLl1blzZx09etSm38SJE29pvqdPn1a/fv3k6+srR0dH1axZUx999NFNl2/Lli1ycnLS0aNHVbNmTTk6OsrPz0+DBg3ShQsXbPrezrYq6Yav2fXvvUWLFqlFixYqV66cHB0dFRQUpHnz5uWYVqVKldSuXbsc7cOGDZOdnV2O9qVLl6px48ZycXFR6dKl9dhjj2nNmjU208v+Gn7++eeys7Oz2aZuZ3uUpPXr1+vRRx+Vq6urvLy81LFjRx08eNCmT/Z16u7ursaNG+d4zzRv3lzNmz
e3aXvzzTdlb2+vTz75JM9+27ZtM6d/M998842uXbumsLCwPPudPHlS4eHhMgxD33//vSpUqJBrv/xc79JfhxGbNWsmd3d3eXh4qFGjRjbLL0kxMTFq06aNSpcuLVdXV9WpU0fvvvuuOTy3z4pTp07J2dk5R21XrlxR37595erqqqCgIO3YsUOSlJ6err59+8rFxUUPPfSQzWe+dHvrIfvn6LVr19SmTRt5e3vrwIEDt/063c77I2v7u97ly5fl5+eX4ztQkubNm6datWrJxcXFZn1mP+TbsmVLpaSkKCoqKkcdeblnD+19/fXXqlSpkh599NFchz/22GOqVKmSzRfI+fPntWDBArm5uWn48OHy8fHR0qVL1blzZ0VGRqpnz55yc3PTk08+qWXLlmnGjBkqUaKEOf6nn34qwzDUq1ev26p127Zt2rx5s3r06KH7779fJ06c0Lx589S8eXMdOHBALi4uuY535MgRNW/eXC4uLnr55Zfl4uKihQsXKiwsTFFRUXrsscduq44b+fzzz3XlyhUNHjxYZcqU0datWzVnzhz98ccf+vzzz81+0dHR6t69ux566CG99dZb8vT0VEJCgkaNGnXL85o5c6bKli2rpKQkffTRRxowYIAqVap00w/QvNzKepWkP//8U82bN9eRI0c0bNgwBQYG6vPPP1efPn106dKlHOF5yZIlSk5O1tChQ5Wamqp3331XLVq00C+//CJfX99ca1mzZo369eunYcOG2RxmjIqK0rFjx9S3b1/5+flp//79WrBggfbv368tW7bc0pfN8OHD1ahRI5u2559/3uZ5XFycmjRpoitXrmj48OEqU6aMIiIi1KFDBy1fvtw81J2RkaF27dpp3bp16tGjh0aMGKHk5GRFRUVp3759qlKlSo75f/zxx+bf2dd5XFycHn74YdnZ2WnYsGHy8fHRd999p/79+yspKUkjR4684XKdP39eqampGjx4sFq0aKEXXnhBR48e1dy5cxUTE6OYmBg5OjpKuvVt9XpPPvmkOnfuLOmvILZgwQKb4fPmzVPNmjXVoUMHlSxZUl9//bWGDBmizMxMDR069IZ152XSpEmaOHGimjRposmTJ8vBwUExMTFav369WrVqles4165d07/+9a8bTvNWtse1a9eqdevWqly5siZOnKg///xTc+bMUdOmTbVz584cX+RZ6zQhIUHvv/++unXrpn379unBBx/MtYZFixbp1Vdf1TvvvKOnn346z9fg+nOdbmbz5s0qU6aMKlaseMM+58+fV3h4uM6fP6+1a9eqZs2aeU4zv9b74sWL1a9fP9WsWVPjxo2Tl5eXdu3apdWrV5uvQVRUlNq1a6fy5ctrxIgR8vPz08GDB7Vq1apc/ynPMmHCBKWmpuZoHzVqlCIiIjRs2DDdf//95rmeCxYsUIsWLfTGG2/o3XffVevWrXXs2DG5u7vfcB63uh6ef/55/fjjj4qKilJQUNBtv05/1zvvvKO4uLgc7cuWLdOQIUPUvHlzvfjii3J1ddXBgwf173//O0ffoKAgOTs76+eff7Y5reemCmT/WBF36dIlQ5LRsWPHPPt16NDBkGQkJSUZhmEYkgxJxo8//mj2uXLlilGjRg3Dz8/PSEtLMwzDML7//ntDkvHdd9/ZTK9OnTpGs2bNzOd9+/Y1KlSokGO+yrYL9sqVKzn6REdHG5KMJUuWmG1Zu7F/+OEHwzAMo0uXLkaJEiWMffv2mX0SEhKMMmXKGA0aNDDbFi1aZEgytm3bZrbdaFdw27ZtjYoVK9q05VbflClTDDs7O5td5+PGjTMkGWfPnjXbjh8/bkgypk+fnmMa18uq8fjx42bbb7/9Zkgypk2bZrb17t3bcHV1zXNa2ZfrVtfrrFmzDEnG0qVLzX5paWlGSEiI4ebmZm4nWcvk7Oxs/PHHH2bfmJgYQ5IxatQom3qzXs/t27cbbm5uRrdu3YyMjAybmnN7jT/99FNDkrFx48Y8lze3wxtZXF
1djd69e5vPR44caUgyNm3aZLYlJycbgYGBRqVKlcy6PvroI0OSMWPGjBzTzMzMtHn+r3/9y7Czs7Npq1ixos18+/f
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"from imblearn.over_sampling import SMOTE, SMOTENC\n",
"\n",
"# Nominal (unordered) features of the classification task\n",
"categorical_columns = [\"sex\", \"region\"]\n",
"\n",
"# Work on an explicit copy so the column assignments below never target a slice of another frame\n",
"X_clf_train = X_clf_train.copy()\n",
"\n",
"# Integer-encode the nominal features; the fitted encoders are stored per column for later reuse\n",
"label_encoders = {}\n",
"for column in categorical_columns:\n",
"    le = LabelEncoder()\n",
"    X_clf_train[column] = le.fit_transform(X_clf_train[column])\n",
"    label_encoders[column] = le\n",
"\n",
"# Oversample the minority class of the training split only.\n",
"# SMOTENC is used instead of plain SMOTE: SMOTE interpolates every column, so synthetic rows would get\n",
"# fractional, non-existent category codes for sex/region. SMOTENC interpolates only the continuous\n",
"# columns and picks an existing category for the columns listed in `categorical_features`.\n",
"categorical_idx = [X_clf_train.columns.get_loc(column) for column in categorical_columns]\n",
"smote = SMOTENC(categorical_features=categorical_idx, random_state=42)\n",
"X_clf_train_balanced, y_clf_train_balanced = smote.fit_resample(X_clf_train, y_clf_train)\n",
"\n",
"# Class shares (in percent) after balancing\n",
"print(\"Распределение классов после балансировки:\")\n",
"balanced_counts = y_clf_train_balanced.value_counts(normalize=True) * 100\n",
"print(balanced_counts)\n",
"\n",
"# Bar chart of the balanced training target\n",
"plot_balance(y_clf_train_balanced, \"Обучающая выборка после балансировки (Классификация)\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вот теперь лучше. Идём дальше: балансируем обучающую выборку для регрессии."
]
},
2024-11-29 21:18:24 +04:00
{
"cell_type": "code",
2024-11-30 09:21:24 +04:00
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"До балансировки: (1663, 6) (1663,)\n",
"После балансировки: (1665, 6) (1665,)\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKUAAAJOCAYAAABm7rQwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABvCklEQVR4nO3de3yP9f/H8eeGHZhthm2WmUNynEOOSw45zUiJUlIhX6Spr0Py1VeEah2/Kjl0RL9IVBRpDKE0xyzHhAixyWEbYxv2/v3htouPbczMtdPjfrt9brd9ruv9uT7v9/XZXC/Pz/u6LidjjBEAAAAAAABgI+e87gAAAAAAAACKHkIpAAAAAAAA2I5QCgAAAAAAALYjlAIAAAAAAIDtCKUAAAAAAABgO0IpAAAAAAAA2I5QCgAAAAAAALYjlAIAAAAAAIDtCKWAQiAtLU3Hjx/Xn3/+mdddAQAAKPROnz6tAwcOKCkpKa+7AgAFGqEUUEDFxsZq6NChCgoKkouLi8qXL6/atWsrMTExr7sGm3Tu3FkDBgzI8esrV66se++9Nxd7lL+cOHFCpUqV0pIlS/K6KwCAAs4Yow8//FDNmzdXyZIl5enpqSpVqujzzz/P667BJvPmzZOPj4/OnDmTo9e/9NJLcnJy0vHjx3O5Z/nHI488op49e+Z1N1DAEEoBueSrr76Sk5NTpo+6devm6nvt3btXTZo00dy5czVo0CAtXrxYUVFRWrFihUqVKpWr74X8ae3atVq2bJlGjRqVYV1cXJyee+451axZUyVLllSpUqXUqFEjvfzyy4qPj7e/s3mkbNmy+te//qUXX3wxr7sCAIXSzJkzs6x9blUNlFceffRRPfXUU6pVq5b+7//+T1FRUVq+fLm6d++e112DDS5evKhx48bpmWeekYeHR4Z1M2bMUJs2beTj4yNXV1dVrlxZ/fr106ZNm/Kox3lj1KhR+vrrr/Xbb7/ldVdQgBTP6w4Ahc0LL7ygWrVqWc9feeWVXH+PQYMGycXFRevWrdNtt92W69tH/vfmm2+qXbt2uv322x2Wb9y4UZ07d9aZM2f02GOPqVGjRpKkTZs26bXXXtOaNWu0bNmyvOhynnjqqaf03nvvaeXKlWrbtm1edwcACqUJEyaoSpUqGZbfihooL3z22Wf68ssv9fnnn+vRRx/N6+4gDyxatEi7d+/WwIEDHZafO3dO3bt3V2RkpFq1aqUXXnhBPj4+OnDggObNm6dZs2bp4MGDqlixYh713F4NGzZU48aN9fbbb+uzzz7L6+6ggCCUAnJZhw4d1KZNG+v5xx9/nKvTdDdv3qyVK1dq2bJlBFJF1LFjx/T9999r+vTpDsvj4+P1wAMPqFixYtqyZYtq1qzpsP6VV17RRx99ZGdXlZycLBcXFzk7583E3Fq1aqlu3bqaOXMmoRQA3CJhYWFq3LhxhuW5XQPllTfffFO9evUikCrCZsyYoRYtWmSovUeOHKnIyEhNmjRJQ4cOdVg3btw4TZo0ycZeXjrNNDk5We7u7ra+75V69uypcePGaerUqRlmlQGZ4fQ9IJekpqZKUrb/8/3nn3/qoYceko+Pj0qWLKnmzZvr+++/v+7r1q1bJzc3N+3bt0916tSRq6ur/P39NWjQIJ08edKh7U8//aSHHnpIlSpVkqurqwIDAzVs2DCdO3cu021nNfX+wIEDVpsZM2aobdu28vX1laurq2rXrq1p06Zl2FZW1ysaMmSInJycMiz//PPP1bRpU5UsWVJlypRRq1atHGb0VK5cWX379nV4zfz58+Xk5KTKlStbyw4cOCAnJye99dZbmjRpkoKCguTu7q7WrVtr+/btGd535cqVatmypUqVKiVvb2/df//92rVrl0Ob9GsApD9Kly6tpk2bauHChQ7t2rRp4xBISpeCIGdnZ82ZM+ea7TZu3Ght/3q+//57XbhwQe3bt3dY/sEHH+jvv//W//73vwyBlCT5+flpzJ
gxGZb//PPPatq0qdzc3FS1atUM32ydPHlSzz33nIKDg+Xh4SFPT0+FhYVlmJq9atUqOTk5ae7cuRozZoxuu+02lSxZ0rrO2fz581W7dm25ubmpbt26WrBggfr27evw+UmXLtz/zjvvqE6dOnJzc5Ofn58GDRqkU6dOObTbtGmTQkNDVa5cObm7u6tKlSp68sknM4yvQ4cOWrRokYwxWe9UAIAtLly4oIkTJ6patWrWaU4vvPCCUlJSMrT94Ycf1Lp1a5UuXVqenp5q0qSJw/FUunzcz+xxpeweW66WlJSk7du3KzAwUF26dJGnp6dKlSqlNm3a6KeffnJom93jZbq+fftm2u+XXnrJarN161b17dtXVatWlZubm/z9/fXkk0/qxIkTDtvK6npFmzZtkpOTk2bOnOmw/Pfff1fPnj1Vvnx5ubu7q0aNGvrvf/+bYXtXOnPmjPz9/eXk5KRVq1ZZy9u0aaO6detq8+bNuuuuu6xj8tVfnkmXvljr37+//Pz85Obmpvr162vWrFkOba7+TEuUKKHKlStr5MiRVr0tXT599Mo6dceOHSpTpozuvfdeXbhwIct2aWlpqlevXqb75mrJycmKjIzMUHcdPnxYH3zwgTp06JAhkJKkYsWK6bnnnsswSyo+Pl59+/aVt7e3vLy81K9fP509e9ahzY3W20uXLlXjxo3l7u6uDz74QJL0119/6b777lOpUqXk6+urYcOGaenSpRk+P0lav369OnXqJC8vL5UsWVKtW7fW2rVrHdqcPn1aQ4cOVeXKleXq6ipfX1916NBBv/76q0O7Dh06KCkpSVFRUdfcr0A6ZkoBuST9IOnq6nrdtnFxcbrrrrt09uxZPfvssypbtqxmzZql++67T1999ZUeeOCBLF974sQJJScna/DgwWrbtq2eeuop7du3T1OmTNH69eu1fv16qw/z58/X2bNnNXjwYJUtW1YbNmzQ5MmTdfjwYc2fPz/T7T/wwAPW9RF++uknffjhhw7rp02bpjp16ui+++5T8eLFtWjRIj399NNKS0tTeHh4tvbV1caPH6+XXnpJd911lyZMmCAXFxetX79eK1euVMeOHTN9zYULFxyKp6t99tlnOn36tMLDw5WcnKx3331Xbdu21bZt2+Tn5ydJWr58ucLCwlS1alW99NJLOnfunCZPnqwWLVro119/zRCW/N///Z8k6fjx45o6daoeeughbd++XTVq1Mi0DzNmzNCYMWP09ttvX/fb1cyuDZWVX375RWXLllVQUJDD8u+++07u7u568MEHs72tvXv36sEHH1T//v3Vp08fffrpp+rbt68aNWqkOnXqSLoUoC5cuFAPPfSQqlSpori4OH3wwQdq3bq1du7cqYCAAIdtTpw4US4uLnruueeUkpIiFxcXff/993r44YcVHBysiIgInTp1Sv379890tt+gQYM0c+ZM9evXT88++6z279+v999/X1u2bNHatWtVokQJHTt2TB07dlT58uX1n//8R97e3jpw4IC++eabDNtr1KiRJk2apB07dhSaa5sAQEH1r3/9S7NmzdKDDz6oESNGaP369YqIiNCuXbu0YMECq93MmTP15JNPqk6dOho9erS8vb21ZcsWRUZGZnpMHThwoFq2bClJ+uabbxy2JWXv2JKZ9PDn9ddfl7+/v0aOHCk3Nzd99NFHat++vaKiotSqVStJN368lKRy5co5zKZ5/PHHHdZHRUXpzz//VL9+/eTv768dO3boww8/1I4dO7Ru3bpsfZl1ta1bt6ply5YqUaKEBg4cqMqVK2vfvn1atGjRNU+5fPvttxUXF5fpulOnTqlz587q2bOnevXqpXnz5mnw4MFycXGxvjA6d+6c2rRpo71792rIkCGqUqWK5s+fr759+yo+Pl7//ve/HbaZ/pmmpKRo6dKleuutt+Tm5qaJEydm2odDhw6pU6dOqlmzpubNm6fixbP+r+7//d//adu2bdfbVZIunaWQmpqqO++802
H5Dz/8oAsXLmT4zK6nZ8+eqlKliiIiIvTrr7/q448/lq+vr15//XWrzY3U27t371avXr00aNAgDRgwQDVq1FBSUpL
"text/plain": [
"<Figure size 1200x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder, StandardScaler\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"# Work on an explicit copy so the column assignments below never target a slice of another frame\n",
"X_reg_train = X_reg_train.copy()\n",
"\n",
"# Integer-encode the categorical features of the regression task.\n",
"# NOTE(review): this rebinds `label_encoders`, replacing the dict built in the classification cell.\n",
"label_encoders = {}\n",
"for column in [\"sex\", \"region\", \"smoker\"]:  # categorical columns\n",
"    le = LabelEncoder()\n",
"    X_reg_train[column] = le.fit_transform(X_reg_train[column])\n",
"    label_encoders[column] = le\n",
"\n",
"# Discretise the continuous target into 5 quantile bins; the bin index is used only as a\n",
"# pseudo-class label for the oversampler\n",
"kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile', random_state=222)\n",
"y_reg_binned = kbins.fit_transform(y_reg_train.values.reshape(-1, 1)).ravel()\n",
"\n",
"# Standardise the features (fitted on the training split only).\n",
"# NOTE(review): the integer codes of sex/region/smoker are standardised too - confirm later steps expect that.\n",
"scaler = StandardScaler()\n",
"X_reg_train_scaled = scaler.fit_transform(X_reg_train)\n",
"\n",
"# Randomly duplicate rows until every target bin has the same number of samples\n",
"ros = RandomOverSampler(random_state=42)\n",
"X_reg_balanced, y_reg_binned_balanced = ros.fit_resample(X_reg_train_scaled, y_reg_binned)\n",
"\n",
"# Recover the real continuous target of every resampled row.\n",
"# `kbins.inverse_transform` must not be used here: it returns one constant per bin, which would\n",
"# collapse `charges` to 5 distinct values. `sample_indices_` holds the index of the original\n",
"# training row behind each resampled row, so the true charges can be looked up directly.\n",
"y_reg_balanced = y_reg_train.to_numpy()[ros.sample_indices_]\n",
"\n",
"# Compare the shapes before and after balancing\n",
"print(\"До балансировки:\", X_reg_train.shape, y_reg_train.shape)\n",
"print(\"После балансировки:\", X_reg_balanced.shape, y_reg_balanced.shape)\n",
"\n",
"# Target distribution before (left) and after (right) balancing\n",
"plt.figure(figsize=(12, 6))\n",
"plt.subplot(1, 2, 1)\n",
"plt.hist(y_reg_train, bins=30, alpha=0.7)\n",
"plt.title(\"До балансировки (Charges)\")\n",
"plt.xlabel(\"Charges\")\n",
"plt.ylabel(\"Частота\")\n",
"\n",
"plt.subplot(1, 2, 2)\n",
"plt.hist(y_reg_balanced, bins=30, alpha=0.7, color=\"orange\")\n",
"plt.title(\"После балансировки (Charges)\")\n",
"plt.xlabel(\"Charges\")\n",
"plt.ylabel(\"Частота\")\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Отлично, приступаем к конструированию признаков"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"После One-Hot Encoding:\n",
" age bmi children smoker sex_0.9501294624541127 \\\n",
"0 1.266573 0.156259 -0.919952 -0.514431 True \n",
"1 0.475349 0.280861 0.721550 -0.514431 False \n",
"2 -0.531664 -1.137311 -0.099201 -0.514431 False \n",
"3 0.187631 -0.740551 -0.099201 1.943897 True \n",
"4 -1.179029 0.133306 -0.919952 1.943897 False \n",
"\n",
" region_-0.4674261144497567 region_0.43960962455834773 \\\n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 False True \n",
"4 False False \n",
"\n",
" region_1.3466453635664521 \n",
"0 False \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 True \n",
"После дискретизации:\n",
" age bmi children smoker sex_0.9501294624541127 \\\n",
"0 1.266573 0.156259 -0.919952 -0.514431 True \n",
"1 0.475349 0.280861 0.721550 -0.514431 False \n",
"2 -0.531664 -1.137311 -0.099201 -0.514431 False \n",
"3 0.187631 -0.740551 -0.099201 1.943897 True \n",
"4 -1.179029 0.133306 -0.919952 1.943897 False \n",
"\n",
" region_-0.4674261144497567 region_0.43960962455834773 \\\n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 False True \n",
"4 False False \n",
"\n",
" region_1.3466453635664521 Age_bins BMI_bins \n",
"0 False 4 2 \n",
"1 False 3 2 \n",
"2 False 1 1 \n",
"3 False 2 1 \n",
"4 True 0 2 \n",
"После синтеза признаков:\n",
" age bmi children smoker sex_0.9501294624541127 \\\n",
"0 1.266573 0.156259 -0.919952 -0.514431 True \n",
"1 0.475349 0.280861 0.721550 -0.514431 False \n",
"2 -0.531664 -1.137311 -0.099201 -0.514431 False \n",
"3 0.187631 -0.740551 -0.099201 1.943897 True \n",
"4 -1.179029 0.133306 -0.919952 1.943897 False \n",
"\n",
" region_-0.4674261144497567 region_0.43960962455834773 \\\n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 False True \n",
"4 False False \n",
"\n",
" region_1.3466453635664521 Age_bins BMI_bins Children_BMI Age_BMI_ratio \n",
"0 False 4 2 -0.143751 1.095406 \n",
"1 False 3 2 0.202656 0.371116 \n",
"2 False 1 1 0.112822 3.871955 \n",
"3 False 2 1 0.073463 0.723190 \n",
"4 True 0 2 -0.122635 -1.040345 \n",
"Масштабирование завершено.\n",
"Сгенерированные признаки:\n",
" age bmi children smoker sex_0.9501294624541127 \\\n",
"charge_id \n",
"0 1.266573 0.156259 -0.919952 -0.514431 True \n",
"1 0.475349 0.280861 0.721550 -0.514431 False \n",
"2 -0.531664 -1.137311 -0.099201 -0.514431 False \n",
"3 0.187631 -0.740551 -0.099201 1.943897 True \n",
"4 -1.179029 0.133306 -0.919952 1.943897 False \n",
"\n",
" region_-0.4674261144497567 region_0.43960962455834773 \\\n",
"charge_id \n",
"0 True False \n",
"1 False False \n",
"2 False True \n",
"3 False True \n",
"4 False False \n",
"\n",
" region_1.3466453635664521 Age_bins BMI_bins Children_BMI \\\n",
"charge_id \n",
"0 False 4 2 -0.143751 \n",
"1 False 3 2 0.202656 \n",
"2 False 1 1 0.112822 \n",
"3 False 2 1 0.073463 \n",
"4 True 0 2 -0.122635 \n",
"\n",
" Age_BMI_ratio \n",
"charge_id \n",
"0 1.095406 \n",
"1 0.371116 \n",
"2 3.871955 \n",
"3 0.723190 \n",
"4 -1.040345 \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\repos\\AIM\\AIM-PIbd-31-Sagirov-M-M\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index charge_id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"c:\\repos\\AIM\\AIM-PIbd-31-Sagirov-M-M\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n"
]
}
],
"source": [
"import featuretools as ft\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"\n",
"# One-hot encoding of the categorical features.\n",
"# The balanced matrix may carry the category codes as scaled floats (e.g. 0.95 instead of 1),\n",
"# so re-code every categorical column to 0..k-1 (ordered by value) first; the dummy columns\n",
"# then get stable names such as sex_1 / region_2.\n",
"categorical_columns = [\"sex\", \"region\"]\n",
"X_reg_balanced_df = pd.DataFrame(X_reg_balanced, columns=X_reg_train.columns)\n",
"for column in categorical_columns:\n",
"    X_reg_balanced_df[column] = pd.factorize(X_reg_balanced_df[column], sort=True)[0]\n",
"X_encoded = pd.get_dummies(X_reg_balanced_df, columns=categorical_columns, drop_first=True)\n",
"\n",
"print(\"После One-Hot Encoding:\")\n",
"print(X_encoded.head())\n",
"\n",
"# Discretization of numeric features into 5 equal-width bins\n",
"X_encoded[\"Age_bins\"] = pd.cut(X_encoded[\"age\"], bins=5, labels=False)\n",
"X_encoded[\"BMI_bins\"] = pd.cut(X_encoded[\"bmi\"], bins=5, labels=False)\n",
"\n",
"print(\"После дискретизации:\")\n",
"print(X_encoded.head())\n",
"\n",
"# Hand-crafted features\n",
"X_encoded[\"Children_BMI\"] = X_encoded[\"children\"] * X_encoded[\"bmi\"]  # children x BMI interaction\n",
"# NOTE(review): age and bmi are already standardized at this point, so (bmi + 1) can be close\n",
"# to zero and the ratio becomes unstable - consider deriving ratio features before scaling.\n",
"X_encoded[\"Age_BMI_ratio\"] = X_encoded[\"age\"] / (X_encoded[\"bmi\"] + 1)  # age relative to BMI\n",
"\n",
"print(\"После синтеза признаков:\")\n",
"print(X_encoded.head())\n",
"\n",
"# Normalization (min-max) - kept as a separate array, X_encoded itself is not modified\n",
"scaler_minmax = MinMaxScaler()\n",
"X_normalized = scaler_minmax.fit_transform(X_encoded)\n",
"\n",
"# Standardization (zero mean, unit variance) - also kept as a separate array\n",
"scaler_standard = StandardScaler()\n",
"X_standardized = scaler_standard.fit_transform(X_encoded)\n",
"\n",
"print(\"Масштабирование завершено.\")\n",
"\n",
"# Build the EntitySet; make_index=True states explicitly that charge_id must be created\n",
"es = ft.EntitySet(id=\"charge_data\")\n",
"es = es.add_dataframe(dataframe_name=\"charge_data\", dataframe=X_encoded, index=\"charge_id\", make_index=True)\n",
"\n",
"# Run deep feature synthesis to generate additional features\n",
"features, feature_names = ft.dfs(entityset=es, target_dataframe_name=\"charge_data\", max_depth=2)\n",
"\n",
"print(\"Сгенерированные признаки:\")\n",
"print(features.head())\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Появляются предупреждения о том, что в EntitySet только один датафрейм; их можно игнорировать.\n",
"\n",
"Делаем то же самое для другого датасета"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"После One-Hot Encoding:\n",
" age bmi children sex_1 region_1 region_2 region_3\n",
"0 41 32.200 1 False False False True\n",
"1 64 22.990 0 False False True False\n",
"2 54 27.645 1 False True False False\n",
"3 28 26.315 3 False True False False\n",
"4 27 30.400 3 False True False False\n",
"После дискретизации:\n",
" age bmi children sex_1 region_1 region_2 region_3 Age_bins \\\n",
"0 41 32.200 1 False False False True 2 \n",
"1 64 22.990 0 False False True False 4 \n",
"2 54 27.645 1 False True False False 3 \n",
"3 28 26.315 3 False True False False 1 \n",
"4 27 30.400 3 False True False False 0 \n",
"\n",
" BMI_bins \n",
"0 2 \n",
"1 0 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"После синтеза признаков:\n",
" age bmi children sex_1 region_1 region_2 region_3 Age_bins \\\n",
"0 41 32.200 1 False False False True 2 \n",
"1 64 22.990 0 False False True False 4 \n",
"2 54 27.645 1 False True False False 3 \n",
"3 28 26.315 3 False True False False 1 \n",
"4 27 30.400 3 False True False False 0 \n",
"\n",
" BMI_bins Age_BMI Age_BMI_ratio \n",
"0 2 1320.20 1.273292 \n",
"1 0 1471.36 2.783819 \n",
"2 1 1492.83 1.953337 \n",
"3 1 736.82 1.064032 \n",
"4 1 820.80 0.888158 \n",
" age bmi children sex_1 region_1 region_2 region_3 \\\n",
"0 0.500000 0.443474 1 False False False True \n",
"1 1.000000 0.191972 0 False False True False \n",
"2 0.782609 0.319088 1 False True False False \n",
"3 0.217391 0.282769 3 False True False False \n",
"4 0.195652 0.394320 3 False True False False \n",
"\n",
" Age_bins BMI_bins Age_BMI Age_BMI_ratio \n",
"0 2 2 0.403768 0.303438 \n",
"1 4 0 0.462857 0.829753 \n",
"2 3 1 0.471249 0.540387 \n",
"3 1 1 0.175725 0.230526 \n",
"4 0 1 0.208553 0.169246 \n",
"Сгенерированные признаки:\n",
" age bmi children sex_1 region_1 region_2 region_3 \\\n",
"smoker_id \n",
"0 0.500000 0.443474 1 False False False True \n",
"1 1.000000 0.191972 0 False False True False \n",
"2 0.782609 0.319088 1 False True False False \n",
"3 0.217391 0.282769 3 False True False False \n",
"4 0.195652 0.394320 3 False True False False \n",
"\n",
" Age_bins BMI_bins Age_BMI Age_BMI_ratio \n",
"smoker_id \n",
"0 2 2 0.403768 0.303438 \n",
"1 4 0 0.462857 0.829753 \n",
"2 3 1 0.471249 0.540387 \n",
"3 1 1 0.175725 0.230526 \n",
"4 0 1 0.208553 0.169246 \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\repos\\AIM\\AIM-PIbd-31-Sagirov-M-M\\aimenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index smoker_id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"c:\\repos\\AIM\\AIM-PIbd-31-Sagirov-M-M\\aimenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n"
]
}
],
"source": [
"import featuretools as ft\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"# One-hot encoding of the categorical features\n",
"categorical_columns = [\"sex\", \"region\"]\n",
"X_clf_balanced_df = pd.DataFrame(X_clf_train_balanced, columns=X_clf_train.columns)\n",
"X_encoded = pd.get_dummies(X_clf_balanced_df, columns=categorical_columns, drop_first=True)\n",
"\n",
"print(\"После One-Hot Encoding:\")\n",
"print(X_encoded.head())\n",
"\n",
"# Discretization of numeric features into 5 equal-width bins\n",
"X_encoded[\"Age_bins\"] = pd.cut(X_encoded[\"age\"], bins=5, labels=False)\n",
"X_encoded[\"BMI_bins\"] = pd.cut(X_encoded[\"bmi\"], bins=5, labels=False)\n",
"\n",
"print(\"После дискретизации:\")\n",
"print(X_encoded.head())\n",
"\n",
"# Hand-crafted features\n",
"X_encoded[\"Age_BMI\"] = X_encoded[\"age\"] * X_encoded[\"bmi\"]\n",
"X_encoded[\"Age_BMI_ratio\"] = X_encoded[\"age\"] / (X_encoded[\"bmi\"] + 1e-6)  # epsilon guards against division by zero\n",
"\n",
"print(\"После синтеза признаков:\")\n",
"print(X_encoded.head())\n",
"\n",
"\n",
"scaled_columns = ['age', 'bmi', 'Age_BMI', 'Age_BMI_ratio']\n",
"\n",
"# Standardization: stored in its own frame. Writing it into X_encoded and then applying\n",
"# min-max on top (as before) simply discarded it, because min-max of standardized\n",
"# values equals min-max of the raw values.\n",
"X_clf_standardized = X_encoded.copy()\n",
"X_clf_standardized[scaled_columns] = StandardScaler().fit_transform(X_encoded[scaled_columns])\n",
"\n",
"# Normalization: X_encoded keeps the min-max scaled values used by the steps below\n",
"scaler_minmax = MinMaxScaler()\n",
"X_encoded[scaled_columns] = scaler_minmax.fit_transform(X_encoded[scaled_columns])\n",
"\n",
"# Check the result of scaling\n",
"print(X_encoded.head())\n",
"\n",
"\n",
"# Build the EntitySet; make_index=True states explicitly that smoker_id must be created\n",
"es = ft.EntitySet(id=\"smoker_data\")\n",
"es = es.add_dataframe(dataframe_name=\"smoker_data\", dataframe=X_encoded, index=\"smoker_id\", make_index=True)\n",
"\n",
"# Run deep feature synthesis to generate additional features\n",
"features, feature_names = ft.dfs(entityset=es, target_dataframe_name=\"smoker_data\", max_depth=2)\n",
"\n",
"print(\"Сгенерированные признаки:\")\n",
"print(features.head())\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Те же самые предупреждения, не обращаем на них внимания. Приступаем к оценке."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.8882882882882883\n",
"ROC-AUC: 0.8528610899771755\n",
"Время обучения модели: 0.2450 секунд\n",
"Время предсказания: 0.0123 секунд\n",
"Средняя точность по кросс-валидации: 0.9989\n",
"Корреляция признаков с целевой переменной:\n",
"smoker 1.000000\n",
"sex 0.082326\n",
"bmi 0.011489\n",
"children 0.006362\n",
"region -0.006751\n",
"age -0.023286\n",
"Name: smoker, dtype: float64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAvwAAAKqCAYAAABGj4plAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACuwklEQVR4nOzdd3hT5dsH8G+apElnmu5SCqVQKKVAEWTI3ku2CIgyRJSlMmTUVXABgggiCrJRf68IgqJMmcqSvUuhm+6927RNzvtHJSU0DW0oLYnfz3XlutrT+5w855wmuXOf53mOSBAEAUREREREZJYsarsBRERERET05DDhJyIiIiIyY0z4iYiIiIjMGBN+IiIiIiIzxoSfiIiIiMiMMeEnIiIiIjJjTPiJiIiIiMwYE34iIiIiIjPGhJ+IiIiIyIwx4SciIiIiMmNM+MksbdmyBSKRCBcuXCj3t/Xr10MkEmHo0KFQq9W10DoiIiKimsOEn/5Tdu/ejalTp6Jz58746aefIBaLa7tJRERERE8UE376zzh+/DjGjBkDf39//P7775DL5bXdJCIiIqInjgk//SdcuXIFQ4YMgYeHBw4ePAiFQlEuZseOHWjdujWsrKzg7OyMl19+GXFxcToxEyZMgK2tLSIiItC3b1/Y2NigTp06+OijjyAIgjYuKioKIpEIy5cvx5dffon69evDysoKXbt2xY0bN8o99+3bt/HCCy/A0dERcrkcbdq0wZ49e/TuS7du3SASico9tmzZohP37bffIiAgANbW1jpxO3fu1NlWQEBAuedYvnw5RCIRoqKitMvud5N6cJlGo0GLFi30Pv/Ro0fRuXNn2NjYwMHBAUOGDEFISIhOzMKFCyESiZCamqqz/MKFC+W2ef/YP2znzp0QiUQ4fvy4dtnff/+NkSNHol69epDJZPDy8sKsWbNQUFCgd/02bdrAzs5O5zgtX768XOyD7h8PS0tLpKSk6PztzJkz2u082K2sMu2aMGGC3vP74OP+OfD29sbzzz+PQ4cOITAwEHK5HP7+/ti1a5fetlbm3FXlOBcVFeHDDz9E69atoVAoYGNjg86dO+PYsWMGj9193t7eBvfzQSKRCDNmzMCPP/6IJk2aQC6Xo3Xr1vjrr7904u7/Tz0oNzcX7u7u5do/ZcoU+Pr6wtraGo6OjujRowf+/vvvcm18/vnny7V9xowZ5Z5n8+bN6NGjB1xdXSGTyeDv749vv/1W735PmDBBZ9nrr78OuVyu0z4A+Oabb9CsWTPIZDLUqVMH06dPR2Zmpk7Mw+8Jzs7OGDhwoN73GiL6b5LUdgOInrTw8HD069cPMpkMBw8ehIeHR7mYLVu2YOLEiXj22WexePFiJCUlYdWqVTh16hQuX74MBwcHbaxarUa/fv3Qvn17fP755zhw4ACCg4NRUlKCjz76SGe727ZtQ05ODqZPn47CwkKsWrUKPXr0wPXr1+Hm5gYAuHnzJjp27AhPT08sWLAANjY2+PnnnzF06FD88ssvGDZsWLn2+vn54b333gMApKamYtasWTp/3759O6ZNm4Zu3brhzTffhI2NDUJCQvDZZ5897uHU8f333+P69evllh8+fBj9+/eHj48PFi5ciIKCAqxevRodO3bEpUuX4O3tXa3teNiOHTuQn5+PqVOnwsnJCefOncPq1asRGxuLHTt2aOPOnDmDF198ES1btsSSJUugUCj0Hk9DxGIxfvjhB511Nm/eDLlcjsLCwiq364033kCvXr2067zyyisYNmwYhg8frl3m4uKi/fnu3bsYNWoUpkyZgvHjx2Pz5s0YOXIkDhw4gN69e1fY7orOXVVkZ2djw4YNGDNmDCZPnoycnBxs3LgRffv2xblz5xAYGPjIbQQGBmLOnDk6y7Zt24Y///yzXOyJEyewfft2vPXWW5DJZPjmm2/Qr18/nDt3Tu8X1/u++OILJCUllVteVFSEl19+GXXr1kV6ejrWrVuHfv
36ISQkBPXq1Xv0AXjIt99+i2bNmmHw4MGQSCT4/fffMW3aNGg0GkyfPr3C9YKDg7Fx40Zs374d3bp10y5fuHAhFi1ahF69emHq1KkIDQ3Ft99+i/Pnz+PUqVOQSqXa2PvvCYIgIDw8HCtWrMCAAQMQExNT5f0gIjMkEJmhzZs3CwCEP/74Q2jYsKEAQOjTp4/e2KKiIsHV1VUICAgQCgoKtMv/+OMPAYDw4YcfapeNHz9eACC8+eab2mUajUYYOHCgYGlpKaSkpAiCIAiRkZECAMHKykqIjY3Vxv7zzz8CAGHWrFnaZT179hSaN28uFBYW6mzzueeeE3x9fcu1t2PHjkL37t21v99/rs2bN2uXjRkzRnBwcNDZn2PHjgkAhB07dmiXde3aVWjWrFm551i2bJkAQIiMjNQuu39M7y8rLCwU6tWrJ/Tv37/c8wcGBgqurq5CWlqadtnVq1cFCwsLYdy4cdplwcHBAgDtcbvv/Pnz5bY5fvx4wcbGplxbd+zYIQAQjh07pl2Wn59fLm7x4sWCSCQSoqOjtcuCgoIEAEJCQoJ22f3juWzZsnLbeND94zFmzBihefPm2uV5eXmCvb298NJLLwkAhPPnz1e5XQ8CIAQHB+v9W/369QUAwi+//KJdlpWVJXh4eAitWrUq19bKnLuqHOeSkhJBpVLpxGVkZAhubm7Cq6++qrfND7d/4MCB5ZZPnz5dePjjCYAAQLhw4YJ2WXR0tCCXy4Vhw4Zpl93/n7ovOTlZsLOz0+7rg+1/2Llz5wQAws6dO41qo77z27dvX8HHx0dnWf369YXx48cLgiAI69atEwAIq1ev1olJTk4WLC0thT59+ghqtVq7/OuvvxYACJs2bdIu69q1q9C1a1ed9d99910BgJCcnFzh/hLRfwe79JBZmzBhAu7du4eXXnoJhw4d0qnu3nfhwgUkJydj2rRpOv36Bw4cCD8/P+zdu7fcOjNmzND+fL+rQVFREQ4fPqwTN3ToUHh6emp/b9u2Ldq1a4d9+/YBANLT03H06FG8+OKLyMnJQWpqKlJTU5GWloa+ffvi7t275boVFRUVQSaTGdzvnJwcWFtbP9FxCmvWrEFaWhqCg4N1lickJODKlSuYMGECHB0dtctbtGiB3r17a/f9Qenp6dp9T01NRVZWVoXP+2BcamoqcnJyysVYWVlpf87Ly0Nqaiqee+45CIKAy5cva/+Wk5MDCwsLnSs4VfXKK6/g9u3b2q47v/zyCxQKBXr27Gl0u6qiTp06OleB7O3tMW7cOFy+fBmJiYl616no3FWVWCyGpaUlgNIuQunp6SgpKUGbNm1w6dKlx9q2Ph06dEDr1q21v9erVw9DhgzBwYMHK5xx6+OPP4ZCocBbb72l9++FhYVITU1FSEgIVq1aBSsrK7Rp00Ynpri4uNz/3cNXbwDd85uVlYXU1FR07doVERERev+nf/vtN0ybNg1z587VeU8BSq+SFRUVYebMmbCwKPuonjx5Muzt7cu9L91vY0pKCs6cOYPdu3ejRYsWcHZ21rvfRPTfwoSfzFp6ejp++OEHbN26FYGBgXj77bfLffBGR0cDAJo0aVJufT8/P+3f77OwsICPj4/OssaNGwOATh9pAPD19S23zcaNG2vjwsLCIAgCPvjgA7i4uOg87idjycnJOutnZmbq7WP9oA4dOiA+Ph4LFy5ETEzMI5PoqsrKysJnn32G2bNna7sm3WfoeDZt2hSpqanIy8vTWd6kSROdfX+wS8uD8vLyyh2nV199tVxcTEyM9guHra0tXFxc0LVrV23b7+vQoQM0Gg3efvtthIeHIzU1FRkZGVU6Fi4uLhg4cCA2bdoEANi0aRPGjx+vk6RVtV1V0ahRo3J9ySv6f7z/PBWdO2Ns3boVLVq0gFwuh5OTE1xcXLB3795q/X+7r6LXU35+frlxFAAQGRmJdevWYdGiRRV++d2yZQtcXFzg7++PI0eO4M8//0T9+vV1Yg4dOlTu/2
7jxo3ltnXq1Cn06tVLO27FxcUF7777LoDy5/fKlSsYM2YM1Go10tPTy22roteRpaUlfHx8yr0vnT59Gi4uLnB1dcV
"text/plain": [
"<Figure size 1000x800 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" no 0.91 0.96 0.93 442\n",
" yes 0.79 0.62 0.69 113\n",
"\n",
" accuracy 0.89 555\n",
" macro avg 0.85 0.79 0.81 555\n",
"weighted avg 0.88 0.89 0.88 555\n",
"\n"
]
}
],
"source": [
"import time\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split, cross_val_score\n",
"from sklearn.metrics import accuracy_score, roc_auc_score, classification_report\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"# Encode the categorical features with ONE encoder per column that is fitted once and then\n",
"# applied to every frame, so train / test / full data share the same category -> code mapping.\n",
"# Frames whose column is already numeric (possibly encoded by an earlier cell or a re-run)\n",
"# are skipped, which keeps this cell idempotent.\n",
"label_encoders = {}\n",
"clf_frames = (X_clf_test, X_clf_train, X_clf)\n",
"for column in [\"sex\", \"region\"]:  # categorical columns\n",
"    raw_frames = [frame for frame in clf_frames if not pd.api.types.is_numeric_dtype(frame[column])]\n",
"    if not raw_frames:\n",
"        continue\n",
"    le = LabelEncoder().fit(pd.concat([frame[column] for frame in raw_frames]))\n",
"    for frame in raw_frames:\n",
"        frame[column] = le.transform(frame[column])\n",
"    label_encoders[column] = le\n",
"\n",
"# 1. Predictive quality (accuracy and ROC-AUC for the binary classification task)\n",
"model = RandomForestClassifier(random_state=42)\n",
"\n",
"start_time = time.perf_counter()\n",
"model.fit(X_clf_train, y_clf_train)\n",
"end_time = time.perf_counter()\n",
"\n",
"train_time = end_time - start_time\n",
"\n",
"y_pred = model.predict(X_clf_test)\n",
"accuracy = accuracy_score(y_clf_test, y_pred)\n",
"roc_auc = roc_auc_score(y_clf_test, model.predict_proba(X_clf_test)[:, 1])\n",
"\n",
"print(f\"Accuracy: {accuracy}\")\n",
"print(f\"ROC-AUC: {roc_auc}\")\n",
"print(f\"Время обучения модели: {train_time:.4f} секунд\")\n",
"\n",
"# 2. Inference speed (prediction time on the test set)\n",
"start_time = time.perf_counter()\n",
"y_pred = model.predict(X_clf_test)\n",
"end_time = time.perf_counter()\n",
"predict_time = end_time - start_time\n",
"\n",
"print(f\"Время предсказания: {predict_time:.4f} секунд\")\n",
"\n",
"# 3. Model stability via 5-fold cross-validation on the full data set\n",
"# NOTE(review): the saved output reports 0.9989 here versus 0.888 hold-out accuracy; such a gap\n",
"# usually points to leakage (e.g. duplicated rows shared between folds) - TODO confirm.\n",
"cv_scores = cross_val_score(model, X_clf, y_clf, cv=5, scoring='accuracy')\n",
"mean_cv_score = np.mean(cv_scores)\n",
"print(f\"Средняя точность по кросс-валидации: {mean_cv_score:.4f}\")\n",
"\n",
"# 4. Correlation of the features with the target\n",
"# Encode the target into a NEW series (y_clf itself stays untouched so the cell can be re-run)\n",
"# and give it X_clf's index so that the concat below aligns row by row.\n",
"y_clf_values = np.asarray(y_clf)\n",
"if not pd.api.types.is_numeric_dtype(y_clf_values):\n",
"    y_clf_values = LabelEncoder().fit_transform(y_clf_values)\n",
"y_clf_encoded = pd.Series(y_clf_values, index=X_clf.index, name='smoker')\n",
"\n",
"# Join features and encoded target, then compute pairwise correlations\n",
"correlation_matrix = pd.concat([X_clf, y_clf_encoded], axis=1).corr()\n",
"\n",
"# Correlation of every column with the target (column 'smoker')\n",
"correlation_with_target = correlation_matrix['smoker'].sort_values(ascending=False)\n",
"print(\"Корреляция признаков с целевой переменной:\")\n",
"print(correlation_with_target)\n",
"\n",
"# Heatmap of the correlation matrix\n",
"plt.figure(figsize=(10, 8))\n",
"sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')\n",
"plt.title('Корреляционная матрица признаков')\n",
"plt.show()\n",
"\n",
"# Per-class precision / recall / F1 on the test set\n",
"print(classification_report(y_clf_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
2024-11-29 21:18:24 +04:00
"metadata": {},
2024-11-30 09:21:24 +04:00
"source": [
"Вроде всё, мораль такова — используйте Ctrl+S."
]
2024-11-29 21:18:24 +04:00
}
],
"metadata": {
2024-11-30 09:21:24 +04:00
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
2024-11-29 21:18:24 +04:00
"language_info": {
2024-11-30 09:21:24 +04:00
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
2024-11-29 21:18:24 +04:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}