1444 lines
392 KiB
Plaintext
1444 lines
392 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"https://www.kaggle.com/datasets/harishkumardatalab/medical-insurance-price-prediction Набор представляет собой данные о медицинской страховке.\n",
|
|||
|
"Пример цели: Рассчитать стоимость будущей страховки\n",
|
|||
|
"Входные данные: возраст, пол, индекс массы тела, количество детей, курит ли, регион. Целевая переменная: стоимость"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 108,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Medical_insurance.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 109,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAkA0lEQVR4nO3deXRW9Z348U/CElKWoKhAFBAVQa2IVbRI+VHccatVq3V0FJcZp8WClNaWYxVq3U7RcR0XqgIdl1ac1mXUIiri2FrXQbFURMEVxLoQIgMK5P7+6OE5pigChk8IvF7n5NTc+32ST+jXhLf3uU/KiqIoAgAAYD0rb+wBAACATYP4AAAAUogPAAAghfgAAABSiA8AACCF+AAAAFKIDwAAIIX4AAAAUogPAAAghfgA2AS89tprUVZWFhMmTGjsUQDYhIkPgLUwYcKEKCsrq/e21VZbxaBBg+KBBx5In+fRRx+tN0uLFi1iu+22i5NOOinmzJnTIJ/jT3/6U4wZMyYWLlzYIB8PgE1X88YeAKApOv/886N79+5RFEUsWLAgJkyYEIccckjce++9cdhhh6XPM2zYsOjbt28sW7YsnnvuuRg3blzcd999MWPGjKiurv5SH/tPf/pT/PznP48hQ4ZE+/btG2ZgADZJ4gNgHQwePDj23HPP0vunnXZadOzYMW6//fZGiY8BAwbEMcccExERp5xySuy4444xbNiwmDhxYowaNSp9HgD4LJ52BdAA2rdvH5WVldG8ef3/prN48eIYOXJkdOnSJSoqKqJnz55x6aWXRlEUERGxZMmS6NWrV/Tq1SuWLFlSetwHH3wQnTt3jn322SdWrFix1vPsu+++ERExd+7c1a575JFHYsCAAdG6deto3759fOtb34q//vWvpfNjxoyJH//4xxER0b1799LTu1577bUvnOEfn5628u3RRx9dZe2QIUM+c+2YMWPqrbvzzjtjzz33jLZt29Zbd+mll652lg8++CB+9KMfxa677hpt2rSJdu3axeDBg+P5559fZe3rr78eRxxxRLRu3Tq22mqrGDFiREyePPkzZ3/yySfj4IMPjqqqqvjKV74SAwcOjD/+8Y9f+GcDsKly5QNgHdTU1MR7770XRVHEu+++G1dffXV89NFHceKJJ5bWFEURRxxxREydOjVOO+206NOnT0yePDl+/OMfx9tvvx2XX355VFZWxsSJE6N///5xzjnnxL//+79HRMTQoUOjpqYmJkyYEM2aNVvr+V599dWIiOjQocPnrnnooYdi8ODBsd1228WYMWNiyZIlcfXVV0f//v3jueeei2233TaOOuqoePnll+P222+Pyy+/PLbYYouIiNhyyy3XaI4DDjggTjrppIiIePrpp+Oqq6763LVbbLFFXH755aX3//mf/7ne+SeeeCKOPfbY2G233eKSSy6JqqqqeO+992LEiBFfOMecOXPirrvuiu985zvRvXv3WLBgQdxwww0xcODAmDlzZumpaYsXL45999035s+fH8OHD49OnTrFbbfdFlOnTl3lYz7yyCMxePDg2GOPPWL06NFRXl4e48ePj3333Tf+53/+J/baa681+jMC2KQUAKyx8ePHFxGxyltFRUUxYcKEemvvuuuuIiKKCy64oN7xY445pigrKyteeeWV0rFRo0YV5eXlxWOPPVZMmjSpiIjiiiuu+MJ5pk6dWkREcfPNNxd/+9vfinnz5hX33Xdfse222xZlZWXF008/XRRFUcydO7eIiGL8+PGlx/bp06fYaqutivfff7907Pnnny/Ky8uLk046qXRs7NixRUQUc+fOXeM/p08++aSIiOLMM88sHVv5dU2dOnWV9SeccELRvXv3esciohg9enTp/VGjRhURUcyfP790bOXXNXbs2NXOs3Tp0mLFihX1js2dO7eoqKgozj///NKxyy67rIiI4q677iodW7JkSdGrV696s9fV1RU9evQoDjrooKKurq609v
/+7/+K7t27FwcccMBq5wHYVHnaFcA6+I//+I+YMmVKTJkyJW655ZYYNGhQnH766fG73/2utOb++++PZs2axbBhw+o9duTIkVEURb1XxxozZkzssssucfLJJ8f3v//9GDhw4CqPW51TTz01ttxyy6iuro5DDz00Fi9eHBMnTqx3X8qnzZ8/P6ZPnx5DhgyJzTffvHS8d+/eccABB8T999+/xp/7syxdujQiIlq1arVG6z/55JOoqKhY7Zra2tooLy9fp5veKyoqorz87z/yVqxYEe+//360adMmevbsGc8991xp3R/+8IfYeuut44gjjigda9WqVfzLv/xLvY83ffr0mD17dvzTP/1TvP/++/Hee+/Fe++9F4sXL4799tsvHnvssairq1vrOQE2dp52BbAO9tprr3p/sT/++ONj9913jzPPPDMOO+ywaNmyZbz++utRXV0dbdu2rffYnXbaKSL+fm/BSi1btoybb745+vbtG61atYrx48dHWVnZGs9z3nnnxYABA6JZs2axxRZbxE477bTK/SeftvJz9+zZc5VzO+20U0yePDkWL14crVu3XuMZPu29996LiIiqqqo1Wr9w4cJo06bNatf069cvrrnmmhg+fHicffbZUVVVFR9++OEaffy6urq48sor49prr425c+fWu4/m009Ne/3112P77bdf5c9+hx12qPf+7NmzIyLi5JNP/tzPWVNTE5ttttkazQewqRAfAA2gvLw8Bg0aFFdeeWXMnj07dtlll7X+GJMnT46Iv181mD17dnTv3n2NH7vrrrvG/vvvv9afc31ZeUP6tttuu0br33nnnejWrdtq13z3u9+N5557Lq6++uoYN27cWs1z0UUXxbnnnhunnnpq/OIXv4jNN988ysvL46yzzlqnKxQrHzN27Njo06fPZ675opgC2BSJD4AGsnz58oiI+OijjyIiolu3bvHQQw9FbW1tvasfL730Uun8Si+88EKcf/75ccopp8T06dPj9NNPjxkzZqzxlYO1tfJzz5o1a5VzL730UmyxxRalqx5rcwVmpWeeeSYi4nOf9vVpy5Yti1deeSUOPvjg1a4rLy+PSy+9NGbMmBFz586Na6+9NhYsWFDvJv/Pc+edd8agQYPipptuqnd84cKFpZvoI/7+5zJz5swoiqLe1/3KK6/Ue9z2228fERHt2rXboKIPYEPnng+ABrBs2bJ48MEHo2XLlqWnVR1yyCGxYsWKuOaaa+qtvfzyy6OsrCwGDx5ceuyQIUOiuro6rrzyypgwYUIsWLBgjV7FaV117tw5+vTpExMnTqz3m8tffPHFePDBB+OQQw4pHVsZIWvzG87vvPPO6NmzZ/Tq1esL1959992xZMmS0ssDr87VV18djzzySNx6662x//77R//+/ddonmbNmpVe3nilSZMmxdtvv13v2EEHHRRvv/123HPPPaVjS5cujV/96lf11u2xxx6x/fbbx6WXXlqKzU/729/+tkZzAWxqXPkAWAcPPPBA6QrGu+++G7fddlvMnj07fvrTn0a7du0iIuLwww+PQYMGxTnnnBOvvfZa7LbbbvHggw/G3XffHWeddVbpv55fcMEFMX369Hj44Yejbdu20bt37zjvvPPiZz/7WRxzzDH1QqAhjR07NgYPHhz9+vWL0047rfRSu1VVVfV+v8Yee+wRERHnnHNOfPe7340WLVrE4Ycf/pn3g8yZMyd++ctfxlNPPRVHHXVU3HLLLaVzTz/9dERETJkyJbp27RqdOnWK0aNHx7XXXhv77LNPHHjggaud9y9/+UucffbZMWbMmOjbt+9afa2HHXZY6crSPvvsEzNmzIhbb701tttuu3rrzjjjjLjmmmvi+OOPj+HDh0fnzp3j1ltvLd04v/JqSHl5edx4440xePDg2GWXXeKUU06JrbfeOt5+++2YOnVqtGvXLu699961mhFgk9DIr7YF0KR81kvttmrVqujTp09x3XXX1XvZ1aIoitra2mLEiB
FFdXV10aJFi6JHjx7F2LFjS+ueffbZonnz5sUPfvCDeo9bvnx50bdv36K6urr48MMPP3eelS+1O2nSpNXO/VkvtVs
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC3PUlEQVR4nOzdeXgUVdo28Dsh6SQQOiFACGtYXJIYQISAUYPLABHigssYlwFEBtCXMCM4LvkkiATNjI4vOIILyIg68II6qEjCNqDCQMTIImICLiwRMYQ1IUhoQvf3R6Z7SFLdfbpOpauq+/5dl9cl3U91Vbqrq89T55znhDgcDgeIiIiIiIjI70L1PgAiIiIiIqJgxYSMiIiIiIhIJ0zIiIiIiIiIdMKEjIiIiIiISCdMyIiIiIiIiHTChIyIiIiIiEgnTMiIiIiIiIh0woSMiIiIiIhIJ0zIiIiIiIiIdMKEjIiIDOvAgQMICQnBokWL9D6UBlavXo0rr7wSkZGRCAkJwalTp/Q+JNO64YYbkJqaqvdhEBHphgkZEZEOvvnmG9x9991ITExEZGQkOnfujKFDh+KVV15ptn0uWbIEc+bMafL44cOHMWPGDOzcubPZ9t3YZ599hpCQENd/4eHh6NmzJ0aPHo19+/Zpso8tW7ZgxowZmidLx48fxz333IOoqCjMmzcP7777Llq1auV1u1dffRUhISEYNGiQpsdjVNXV1Xj22WfRt29fREdHIyoqCqmpqXjyySdx+PBhvQ+PiMgwwvQ+ACKiYLNlyxbceOON6NatG8aPH4+EhAT89NNP+OKLL/Dyyy9j8uTJzbLfJUuWYPfu3Xj00UcbPH748GE8++yz6N69O6688spm2bc7f/jDH5CWlobz589j+/btmD9/PgoLC/HNN9+gU6dOUq+9ZcsWPPvss3jwwQcRGxurzQEDKCkpwenTp5Gfn48hQ4YIb7d48WJ0794dX375JX744Qdccsklmh2T0ezbtw9DhgxBeXk5fvvb32LChAmwWCzYtWsXFi5ciA8//BDfffed3odJRGQITMiIiPzsueeeQ0xMDEpKSpokCpWVlfocVDM4c+aM156jjIwM3H333QCAsWPH4rLLLsMf/vAHvP3228jNzfXHYfrM+Rn5kuTt378fW7ZswfLlyzFx4kQsXrwYzzzzTDMdob7q6upw55134siRI/jss89w3XXXNXj+ueeew1/+8he/H5PdbofFYvHrfomIRHDIIhGRn/3444+44oorFBv08fHxTR77xz/+gYEDB6Jly5Zo06YNBg8ejLVr17qe//jjj5GVlYVOnTohIiICvXr1Qn5+Pi5cuOCKueGGG1BYWIiDBw+6hgl2794dn332GdLS0gDUJ0TO5y6es7V161bcfPPNiImJQcuWLXH99ddj8+bNDY5xxowZCAkJQWlpKe6//360adOmSUNcxE033QSgPoHxZMOGDcjIyECrVq0QGxuL22+/HWVlZQ2O5/HHHwcA9OjRw/V3HThwwOPrvv/+++jfvz+ioqLQrl07/O53v8PPP//sev6GG27AmDFjAABpaWkICQnBgw8+6PXvWrx4Mdq0aYOsrCzcfffdWLx4sWLc8ePHMWrUKFitVsTGxmLMmDH4+uuvFefR7dmzB3fffTfi4uIQGRmJAQMGYMWKFR6P4/z584iLi8PYsWObPFddXY3IyEj86U9/cj32yiuv4IorrnCdewMGDMCSJUs87uOf//wnvv76azz99NOK54DVasVzzz3X5PHS0lLceOONaNmyJTp37owXXnihwfM2mw3Tp09H//79ERMTg1atWiEjIwOffvppgzjnvMO//vWvmDNnDnr16oWIiAiUlpYCqB8uO2DAAERGRqJXr1544403XOdvY//4xz9c50NcXBzuvfde/PTTTx7/fiIiX7GHjIjIzxITE1FcXIzdu3d7LWbw7LPPYsaMGbjmmmswc+ZMWCwWbN26FRs2bMCwYc
MAAIsWLUJ0dDSmTp2K6OhobNiwAdOnT0d1dTVefPFFAMDTTz+NqqoqHDp0CLNnzwYAREdHIzk5GTNnzsT06dMxYcIEZGRkAACuueYaAPWJz/Dhw9G/f38888wzCA0NxVtvvYWbbroJmzZtwsCBAxsc729/+1tceumleP755+FwOHx+b3788UcAQNu2bd3G/Otf/8Lw4cPRs2dPzJgxA2fPnsUrr7yCa6+9Ftu3b0f37t1x55134rvvvsP//d//Yfbs2WjXrh0AoH379m5fd9GiRRg7dizS0tJQUFCAI0eO4OWXX8bmzZuxY8cOxMbG4umnn8bll1+O+fPnY+bMmejRowd69erl9e9avHgx7rzzTlgsFtx333147bXXUFJS4kqGAcBut+PWW2/Fl19+iUceeQRJSUn4+OOPXQngxb799ltce+216Ny5M5566im0atUK7733HkaOHIl//vOfuOOOOxSPIzw8HHfccQeWL1+ON954o0GP0UcffYRz587h3nvvBQAsWLAAf/jDH3D33Xfjj3/8I2pra7Fr1y5s3boV999/v9u/1ZkUjho1yuv74nTy5EncfPPNuPPOO3HPPffggw8+wJNPPonevXtj+PDhAOoTxjfffBP33Xcfxo8fj9OnT2PhwoXIzMzEl19+2WS47VtvvYXa2lpMmDABERERiIuLw44dO3DzzTejY8eOePbZZ3HhwgXMnDlT8bx47rnnkJeXh3vuuQe///3vcfToUbzyyisYPHiw63wgItKEg4iI/Grt2rWOFi1aOFq0aOFIT093PPHEE441a9Y4bDZbg7jvv//eERoa6rjjjjscFy5caPCc3W53/f+vv/7aZB8TJ050tGzZ0lFbW+t6LCsry5GYmNgktqSkxAHA8dZbbzXZx6WXXurIzMxssr8ePXo4hg4d6nrsmWeecQBw3HfffULvwaeffuoA4Pj73//uOHr0qOPw4cOOwsJCR/fu3R0hISGOkpISh8PhcOzfv7/JsV155ZWO+Ph4x/Hjx12Pff31147Q0FDH6NGjXY+9+OKLDgCO/fv3ez0em83miI+Pd6SmpjrOnj3renzlypUOAI7p06e7HnvrrbccAFzH6M1XX33lAOBYt26dw+Gof1+7dOni+OMf/9gg7p///KcDgGPOnDmuxy5cuOC46aabmrwHv/nNbxy9e/du8Pna7XbHNddc47j00ks9Hs+aNWscAByffPJJg8dHjBjh6Nmzp+vft99+u+OKK64Q+hsv1q9fP0dMTIxw/PXXX+8A4HjnnXdcj507d86RkJDguOuuu1yP1dXVOc6dO9dg25MnTzo6dOjgeOihh1yPOc8Zq9XqqKysbBB/6623Olq2bOn4+eefXY99//33jrCwMMfFTaIDBw44WrRo4XjuuecabP/NN984wsLCmjxORCSDQxaJiPxs6NChKC4uxm233Yavv/4aL7zwAjIzM9G5c+cGQ84++ugj2O12TJ8+HaGhDS/XFw+vioqKcv3/6dOncezYMWRkZODXX3/Fnj17VB/nzp078f333+P+++/H8ePHcezYMRw7dgxnzpzBb37zG2zcuBF2u73BNg8//LBP+3jooYfQvn17dOrUCVlZWThz5gzefvttDBgwQDH+l19+wc6dO/Hggw8iLi7O9XifPn0wdOhQFBUV+f6HAvjqq69QWVmJ//mf/0FkZKTr8aysLCQlJaGwsFDV6wL1vWMdOnTAjTfeCKD+s8vOzsbSpUsbDCtdvXo1wsPDMX78eNdjoaGhmDRpUoPXO3HiBDZs2IB77rnH9XkfO3YMx48fR2ZmJr7//vsGwywbu+mmm9CuXTssW7bM9djJkyexbt06ZGdnux6LjY3FoUOHUFJS4tPfW11djdatW/u0TXR0NH73u9+5/m2xWDBw4MAGFTdbtGjh6tGz2+04ceIE6urqMGDAAGzfvr3Ja951110Ner4uXLiAf/3rXxg5cmSDgjGXXHKJqxfOafny5bDb7bjnnntc7+
+xY8eQkJCASy+9tMkwSSIiGRyySESkg7S0NCxfvhw2mw1ff/01PvzwQ8yePRt33303du7ciZSUFPz4448IDQ1FSkq
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Визуализация данных - ящик с усами. Как видим, выборка относительно сбалансирована, пускай и смещена в меньшую сторону. Пустых значений нет.\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df[\"age\"])\n",
|
|||
|
"plt.title(\"Box Plot для age\")\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Визуализируем зависимость стоимости от возраста\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"age\"], df[\"charges\"])\n",
|
|||
|
"plt.xlabel('Age')\n",
|
|||
|
"plt.ylabel(\"Charge\")\n",
|
|||
|
"plt.title('Scatter Plot of Age vs Charge')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 110,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" age sex bmi children smoker region charges\n",
|
|||
|
"14 27 male 42.13 0 yes southeast 39611.7577\n",
|
|||
|
"19 30 male 35.30 0 yes southwest 36837.4670\n",
|
|||
|
"23 34 female 31.92 1 yes northeast 37701.8768\n",
|
|||
|
"29 31 male 36.30 2 yes southwest 38711.0000\n",
|
|||
|
"30 22 male 35.60 0 yes southwest 35585.5760\n",
|
|||
|
"... ... ... ... ... ... ... ...\n",
|
|||
|
"2735 52 male 41.80 2 yes southeast 47269.8540\n",
|
|||
|
"2736 64 male 36.96 2 yes southeast 49577.6624\n",
|
|||
|
"2744 32 male 33.63 1 yes northeast 37607.5277\n",
|
|||
|
"2764 22 female 31.02 3 yes southeast 35595.5898\n",
|
|||
|
"2765 47 male 36.08 1 yes southeast 42211.1382\n",
|
|||
|
"\n",
|
|||
|
"[296 rows x 7 columns]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC3pUlEQVR4nOzdeXhU1fkH8G/2BMIEIoYAYQmghBgiiIhRExGBCHFBUUEtW5HFElvBqs1PQCBiWq0FKyiiVlygoC0qhSBEEEGJGhBETEBZI0gIewgShmHm9wedKUnunTlzz83cuTPfz/PwtM68Z+7NrOe955z3hDgcDgeIiIiIiIjI50KNPgEiIiIiIqJgxYSMiIiIiIjIIEzIiIiIiIiIDMKEjIiIiIiIyCBMyIiIiIiIiAzChIyIiIiIiMggTMiIiIiIiIgMwoSMiIiIiIjIIEzIiIiIiIiIDMKEjIiI/Na+ffsQEhKCBQsWGH0qtXzyySfo1q0boqOjERISgpMnTxp9SqbVu3dvpKWlGX0aRESGYUJGRGSA77//Hvfeey/atWuH6OhotG7dGv369cPLL7/cYMdctGgRZs+eXe/2X375BdOmTcPWrVsb7Nh1rVu3DiEhIa5/ERER6NChA4YPH449e/bocoyNGzdi2rRpuidLx44dw/3334+YmBjMnTsX7777Lho3buyx3SuvvIKQkBD06tVL1/PxV1VVVZg+fTquvvpqxMbGIiYmBmlpaXjqqafwyy+/GH16RER+I9zoEyAiCjYbN27ELbfcgrZt22LMmDFITEzEzz//jK+++govvfQSHn300QY57qJFi7B9+3Y89thjtW7/5ZdfMH36dLRv3x7dunVrkGOr+f3vf4+ePXvi/Pnz+PbbbzF//nysWLEC33//PVq1aiX12Bs3bsT06dMxcuRING3aVJ8TBlBSUoLTp08jPz8fffv2FW63cOFCtG/fHt988w127dqFTp066XZO/mbPnj3o27cvysvLcd9992Hs2LGIjIzEtm3b8Oabb+LDDz/Ejz/+aPRpEhH5BSZkREQ+NnPmTMTFxaGkpKReolBZWWnMSTWAM2fOeBw5yszMxL333gsAGDVqFK688kr8/ve/x9tvv428vDxfnKbXnK+RN0ne3r17sXHjRixduhTjxo3DwoUL8cwzzzTQGRrLZrPhnnvuweHDh7Fu3TrcdNNNte6fOXMm/vKXv/j8nOx2OyIjI316XCIiEZyySETkY7t378ZVV12l2KFPSEiod9t7772H6667Do0aNUKzZs2QlZWF1atXu+7/+OOPkZOTg1atWiEqKgodO3ZEfn4+Lly44Irp3bs3VqxYgf3797umCbZv3x7r1q1Dz549AVxMiJz3Xbpm6+uvv8Ztt92GuLg4NGrUCDfffDO+/PLLWuc4bdo0hISEoLS0FA8++CCaNWtWryMuok+fPgAuJjDurF27FpmZmWjcuDGaNm2Ku+66C2VlZbXO54knngAAJCcnu/6uffv2uX3cDz74AD169EBMTAyaN2+O3/zmNzh48KDr/t69e2PEiBEAgJ49eyIkJAQjR470+HctXLgQzZo1Q05ODu69914sXLhQMe7YsWMYNmwYLBYLmjZtihEjRuC7775TXEe3Y8cO3HvvvYiPj0d0dDSuvfZaLFu2zO15nD9/HvHx8Rg1alS9+6qqqhAdHY0//vGPrttefvllXHXVVa733rXXXotFixa5Pca///1vfPfdd3j66acV3wMWiwUzZ86sd3tpaSluueUWNGrUCK1bt8bzzz9f636r1YqpU6eiR48eiIuLQ+PGjZGZmYnPPvusVpxz3eFf//pXzJ49Gx07dkRUVBRKS0sBXJwue+211yI6OhodO3bEa6+95nr/1vXee++53g/x8fEYOnQofv75Z7d/PxGRtzhCRkTkY+3atUNxcTG2b9/usZjB9OnTMW3aNNxwww2YMWMGIiMj8fXXX2Pt2rXo378/AGDBgg
WIjY3FpEmTEBsbi7Vr12Lq1KmoqqrCCy+8AAB4+umncerUKRw4cACzZs0CAMTGxqJLly6YMWMGpk6dirFjxyIzMxMAcMMNNwC4mPgMGDAAPXr0wDPPPIPQ0FC89dZb6NOnDzZs2IDrrruu1vned999uOKKK/Dcc8/B4XB4/dzs3r0bAHDZZZepxnz66acYMGAAOnTogGnTpuHs2bN4+eWXceONN+Lbb79F+/btcc899+DHH3/EP//5T8yaNQvNmzcHAFx++eWqj7tgwQKMGjUKPXv2REFBAQ4fPoyXXnoJX375JbZs2YKmTZvi6aefRufOnTF//nzMmDEDycnJ6Nixo8e/a+HChbjnnnsQGRmJBx54AK+++ipKSkpcyTAA2O123HHHHfjmm2/wyCOPICUlBR9//LErAbzUDz/8gBtvvBGtW7fGn/70JzRu3Bjvv/8+Bg0ahH//+9+4++67Fc8jIiICd999N5YuXYrXXnut1ojRRx99hHPnzmHo0KEAgNdffx2///3vce+99+IPf/gDampqsG3bNnz99dd48MEHVf9WZ1I4bNgwj8+L04kTJ3Dbbbfhnnvuwf33349//etfeOqpp9C1a1cMGDAAwMWE8Y033sADDzyAMWPG4PTp03jzzTeRnZ2Nb775pt5027feegs1NTUYO3YsoqKiEB8fjy1btuC2225Dy5YtMX36dFy4cAEzZsxQfF/MnDkTU6ZMwf3334+HH34YR44cwcsvv4ysrCzX+4GISBcOIiLyqdWrVzvCwsIcYWFhjoyMDMeTTz7pWLVqlcNqtdaK++mnnxyhoaGOu+++23HhwoVa99ntdtf///XXX+sdY9y4cY5GjRo5ampqXLfl5OQ42rVrVy+2pKTEAcDx1ltv1TvGFVdc4cjOzq53vOTkZEe/fv1ctz3zzDMOAI4HHnhA6Dn47LPPHAAc//jHPxxHjhxx/PLLL44VK1Y42rdv7wgJCXGUlJQ4HA6HY+/evfXOrVu3bo6EhATHsWPHXLd99913jtDQUMfw4cNdt73wwgsOAI69e/d6PB+r1epISEhwpKWlOc6ePeu6ffny5Q4AjqlTp7pue+uttxwAXOfoyaZNmxwAHEVFRQ6H4+LzmpSU5PjDH/5QK+7f//63A4Bj9uzZrtsuXLjg6NOnT73n4NZbb3V07dq11utrt9sdN9xwg+OKK65wez6rVq1yAHD85z//qXX7wIEDHR06dHD991133eW46qqrhP7GS3Xv3t0RFxcnHH/zzTc7ADjeeecd123nzp1zJCYmOgYPHuy6zWazOc6dO1er7YkTJxwtWrRw/Pa3v3Xd5nzPWCwWR2VlZa34O+64w9GoUSPHwYMHXbf99NNPjvDwcMelXaJ9+/Y5wsLCHDNnzqzV/vvvv3eEh4fXu52ISAanLBIR+Vi/fv1QXFyMO++8E9999x2ef/55ZGdno3Xr1rWmnH300Uew2+2YOnUqQkNrf11fOr0qJibG9f9Pnz6No0ePIjMzE7/++it27Nih+Ty3bt2Kn376CQ8++CCOHTuGo0eP4ujRozhz5gxuvfVWrF+/Hna7vVab8ePHe3WM3/72t7j88svRqlUr5OTk4MyZM3j77bdx7bXXKsYfOnQIW7duxciRIxEfH++6PT09Hf369UNhYaH3fyiATZs2obKyEr/73e8QHR3tuj0nJwcpKSlYsWKFpscFLo6OtWjRArfccguAi6/dkCFDsHjx4lrTSj/55BNERERgzJgxrttCQ0MxYcKEWo93/PhxrF27Fvfff7/r9T569CiOHTuG7Oxs/PTTT7WmWdbVp08fNG/eHEuWLHHdduLECRQVFWHIkCGu25o2bYoDBw6gpKTEq7+3qqoKTZo08apNbGwsfvOb37j+OzIyEtddd12tipthYWGuET273Y7jx4/DZrPh2muvxbffflvvMQcPHlxr5OvChQv49NNPMWjQoFoFYzp16uQahXNaunQp7HY77r//ftfze/ToUSQmJu
KKK66oN02SiEgGpywSERmgZ8+eWLp0KaxWK7777jt8+OGHmDVrFu69915s3boVqamp2L17N0JDQ5Gamur2sX744Qd
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Есть шумы, убираем\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df[\"charges\"].quantile(0.25)\n",
|
|||
|
"Q3 = df[\"charges\"].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df[\"charges\"] < (Q1 - threshold)) | (df[\"charges\"] > (Q3 + threshold))\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае строки с выбросами помечаются нулём и удаляются из выборки (медианное значение не используется)\n",
|
|||
|
"median_charge = df[\"charges\"].median()\n",
|
|||
|
"df.loc[outliers, \"charges\"] = 0\n",
|
|||
|
"df = df[df.charges != 0]\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"age\"], df[\"charges\"])\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.ylabel(\"Charge\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Age vs Charge\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Теперь создадим выборки."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 111,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 1485\n",
|
|||
|
"Размер контрольной выборки: 495\n",
|
|||
|
"Размер тестовой выборки: 496\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение остатка на контрольную и тестовую выборки\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Сохранение выборок в файлы\n",
|
|||
|
"train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n",
|
|||
|
"val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n",
|
|||
|
"test_df.to_csv(\".//static//csv//test_data.csv\", index=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проанализируем сбалансированность выборок"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 112,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Review_type в обучающей выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 765\n",
|
|||
|
"male 720\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 48.48%\n",
|
|||
|
"Процент женщин: 51.52%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Review_type в контрольной выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"male 257\n",
|
|||
|
"female 238\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 51.92%\n",
|
|||
|
"Процент женщин: 48.08%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Review_type в тестовой выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 259\n",
|
|||
|
"male 237\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 47.78%\n",
|
|||
|
"Процент женщин: 52.22%\n",
|
|||
|
"\n",
|
|||
|
"Аугментация данных не требуется.\n",
|
|||
|
"Аугментация данных не требуется.\n",
|
|||
|
"Аугментация данных не требуется.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Оценка сбалансированности\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['sex'].value_counts()\n",
|
|||
|
" print(f\"Распределение Review_type в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print(f\"Процент мужчин: {counts['male'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент женщин: {counts['female'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Определение необходимости аугментации данных\n",
|
|||
|
"def need_augmentation(df):\n",
|
|||
|
" counts = df['sex'].value_counts()\n",
|
|||
|
" ratio = counts['male'] / counts['female']\n",
|
|||
|
" if ratio > 1.5 or ratio < 0.67:\n",
|
|||
|
" print(\"Необходима аугментация данных для балансировки классов.\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(\"Аугментация данных не требуется.\")\n",
|
|||
|
" \n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"need_augmentation(train_df)\n",
|
|||
|
"need_augmentation(val_df)\n",
|
|||
|
"need_augmentation(test_df)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
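"По результатам анализа аугментация не требуется: соотношение мужчин и женщин во всех выборках находится в допустимом диапазоне. Тем не менее ниже показано применение oversampling и undersampling."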
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 113,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение sex в обучающей выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"male 765\n",
|
|||
|
"female 765\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в контрольной выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"male 257\n",
|
|||
|
"female 257\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в тестовой выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 259\n",
|
|||
|
"male 259\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение sex в обучающей выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 720\n",
|
|||
|
"male 720\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в контрольной выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 238\n",
|
|||
|
"male 238\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в тестовой выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 237\n",
|
|||
|
"male 237\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование категориальных признаков в числовые\n",
|
|||
|
"def encode(df):\n",
|
|||
|
" label_encoders = {}\n",
|
|||
|
" for column in df.select_dtypes(include=['object']).columns:\n",
|
|||
|
" if column != 'sex': # Пропускаем целевую переменную\n",
|
|||
|
" le = LabelEncoder()\n",
|
|||
|
" df[column] = le.fit_transform(df[column])\n",
|
|||
|
" label_encoders[column] = le\n",
|
|||
|
" return label_encoders\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование целевой переменной в числовые значения\n",
|
|||
|
"def encode_target(df):\n",
|
|||
|
" le = LabelEncoder()\n",
|
|||
|
" df['sex'] = le.fit_transform(df['sex'])\n",
|
|||
|
" return le\n",
|
|||
|
"\n",
|
|||
|
"# Применение кодирования\n",
|
|||
|
"label_encoders = encode(train_df)\n",
|
|||
|
"encode(val_df)\n",
|
|||
|
"encode(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Кодирование целевой переменной\n",
|
|||
|
"le_target = encode_target(train_df)\n",
|
|||
|
"encode_target(val_df)\n",
|
|||
|
"encode_target(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка типов данных\n",
|
|||
|
"def check_data_types(df):\n",
|
|||
|
" for column in df.columns:\n",
|
|||
|
" if df[column].dtype == 'object':\n",
|
|||
|
" print(f\"Столбец '{column}' содержит строковые данные.\")\n",
|
|||
|
"\n",
|
|||
|
"check_data_types(train_df)\n",
|
|||
|
"check_data_types(val_df)\n",
|
|||
|
"check_data_types(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Функция для выполнения oversampling\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" if 'sex' not in df.columns:\n",
|
|||
|
" print(\"Столбец 'sex' отсутствует.\")\n",
|
|||
|
" return df\n",
|
|||
|
" \n",
|
|||
|
" X = df.drop('sex', axis=1)\n",
|
|||
|
" y = df['sex']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"# Функция для выполнения undersampling\n",
|
|||
|
"def undersample(df):\n",
|
|||
|
" if 'sex' not in df.columns:\n",
|
|||
|
" print(\"Столбец 'sex' отсутствует.\")\n",
|
|||
|
" return df\n",
|
|||
|
" \n",
|
|||
|
" X = df.drop('sex', axis=1)\n",
|
|||
|
" y = df['sex']\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"# Применение oversampling и undersampling к каждой выборке\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df)\n",
|
|||
|
"val_df_undersampled = undersample(val_df)\n",
|
|||
|
"test_df_undersampled = undersample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Обратное преобразование целевой переменной в строковые метки\n",
|
|||
|
"def decode_target(df, le_target):\n",
|
|||
|
" df['sex'] = le_target.inverse_transform(df['sex'])\n",
|
|||
|
"\n",
|
|||
|
"decode_target(train_df_oversampled, le_target)\n",
|
|||
|
"decode_target(val_df_oversampled, le_target)\n",
|
|||
|
"decode_target(test_df_oversampled, le_target)\n",
|
|||
|
"\n",
|
|||
|
"decode_target(train_df_undersampled, le_target)\n",
|
|||
|
"decode_target(val_df_undersampled, le_target)\n",
|
|||
|
"decode_target(test_df_undersampled, le_target)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка результатов\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" if 'sex' not in df.columns:\n",
|
|||
|
" print(f\"Столбец 'sex' отсутствует в {name}.\")\n",
|
|||
|
" return\n",
|
|||
|
" \n",
|
|||
|
" counts = df['sex'].value_counts()\n",
|
|||
|
" print(f\"Распределение sex в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" \n",
|
|||
|
" if 'male' in counts and 'female' in counts:\n",
|
|||
|
" print(f\"Процент мужчин: {counts['male'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент женщин: {counts['female'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(\"Отсутствуют один или оба класса (male/female.\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности после oversampling\n",
|
|||
|
"print(\"Оверсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности после undersampling\n",
|
|||
|
"print(\"Андерсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Pima Indians Diabetes Database"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database Датасет диабетов\n",
|
|||
|
"Использование: предугадывание склонности к диабету\n",
|
|||
|
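"Входные данные: беременности, глюкоза, кровяное давление, толщина кожи, инсулин, индекс массы тела, наследственная предрасположенность к диабету (DiabetesPedigreeFunction), возраст. Целевая переменная: наличие диабета (Outcome)"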
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 114,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
|
|||
|
" 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//diabetes.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Анализируем датафрейм при помощи \"ящика с усами\". Есть смещение в сторону меньших значений, это можно исправить при помощи oversampling и undersampling."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 115,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAqjElEQVR4nO3de5hVdb348c9cmGFSGFQUGLkIaqAkSkqGhEbeUjRMJS1QCT2YQopZiQdTLD2al2OpKdBRsSBNzCQTjuINj8Yp0AfxUoA6XlEIkYs4yGXW74/zYz/uuM0gfDczvF7Psx+dtb57zXf4tmze7LX2LsqyLAsAAIBtrLjQEwAAAHYM4gMAAEhCfAAAAEmIDwAAIAnxAQAAJCE+AACAJMQHAACQhPgAAACSEB8AAEAS4gOgkXvjjTeiqKgoxo0bV+ipALCDEx8AdTRu3LgoKirKe+yxxx7Rp0+fmDJlSvL5PPXUU3lzadKkSXTq1CnOPPPMeP3117fK9/jLX/4So0aNiiVLlmyV4xXa5MmTo6ioKKqqqqK2trbQ0wHY4ZQWegIADc1Pf/rT6NixY2RZFgsWLIhx48bF8ccfHw899FCccMIJyedzwQUXRI8ePWL16tXx/PPPx9ixY+Phhx+OF198Maqqqj7Tsf/yl7/ElVdeGYMGDYoWLVpsnQkX0IQJE2KvvfaKN954I5544ok46qijCj0lgB2KVz4A6um4446LgQMHxhlnnBE//OEP43/+53+iSZMmcc899xRkPr17946BAwfGd7/73bjlllvihhtuiMWLF8fdd99dkPlsr1asWBGTJk2KH/zgB9G9e/eYMGFCoacEsMMRHwCfUYsWLaKioiJKS/NfTF6xYkVcfPHF0a5duygvL4/OnTvHDTfcEFmWRURETU1NdOnSJbp06RI1NTW55y1evDjatGkThx12WKxdu7be8/na174WERHV1dWbHPfEE09E7969Y6eddooWLVpEv3794u9//3tu/6hRo+JHP/pRRER07Ngxd3nXG2+8sdk5/OvlaeseTz311HpjBw0atMGxo0aNyht3//33xyGHHBLNmjXLG3fDDTdsdj4REX/84x+jpqYm+vfvH6effno88MADsXLlyvXG1dTUxAUXXBAtW7aMZs2axTe+8Y149913Nzind999NwYPHhytWrWK8vLy6Nq1a9x55511mg/AjshlVwD1tHTp0li0aFFkWRYLFy6MW265JT766KMYOHBgbkyWZfGNb3wjnnzyyTj77LPjoIMOikceeSR+9KMfxbvvvhs33XRTVFRUxN133x29evWKkSNHxn/+539GRMTQoUNj6dKlMW7cuCgpKan3/F577bWIiNhtt902Ouaxxx6L4447Ljp16hSjRo2KmpqauOWWW6JXr17x/PPPx1577RUnn3xyzJ07N+6555646aabomXLlhERsfvuu9dpHkcffXSceeaZERExY8aMuPnmmzc6tmXLlnHTTTflvj7jjDPy9k+fPj2+9a1vxYEHHhjXXnttVFZWxqJFi+Kiiy6q01wi/u+Sqz59+kTr1q3j9NNPjxEjRsRDDz0U/fv3zxs3aNCguO++++KMM86IL3/5yzFt2rTo27fvesdbsGBBfPnLX46ioqIYNmxY7L777jFlypQ4++yzY9myZTF8+PA6zw1gh5EBUCd33XVXFhHrPcrLy7Nx48bljX3wwQeziMiuuuqqvO2nnnpqVlRUlL366qu5bZdeemlWXFycPf3009nEiROziMh+8YtfbHY+Tz75ZBYR2Z133pn985//zObPn589/PDD2V577ZUVFRVlM2bMyLIsy6qrq7OIyO66667ccw866KBsjz32yD744IPcthdeeCErLi7OzjzzzNy266+/PouIrLq6us5/TqtWrcoiIhs2bFhu27qf68knn1xv/IABA7KOHTvmbYuI7Iorrsh9femll2YRkb333nu5bet+ruuvv36zc1qwYEFWWl
qa/frXv85tO+yww7J+/frljXvuueeyiMiGDx+et33QoEHrzenss8/O2rRpky1atChv7Omnn55VVlZmH3/88WbnBbCjcdkVQD396le/iqlTp8bUqVNj/Pjx0adPnzjnnHPigQceyI2ZPHlylJSUxAUXXJD33IsvvjiyLMt7d6xRo0ZF165d46yzzorzzz8/jjjiiPWetymDBw+O3XffPaqqqqJv376xYsWKuPvuu+OQQw7Z4Pj33nsvZs2aFYMGDYpdd901t71bt25x9NFHx+TJk+v8vTdk3aVMTZs2rdP4VatWRXl5+SbHLF++PIqLi7f4pvd77703iouL45RTTslt+/a3vx1TpkyJDz/8MLftv//7vyMi4vzzz897/ve///28r7Msiz/84Q9x4oknRpZlsWjRotzj2GOPjaVLl8bzzz+/RXMFaMxcdgVQT1/60pfyfrH/9re/Hd27d49hw4bFCSecEGVlZfHmm29GVVVVNGvWLO+5++23X0REvPnmm7ltZWVlceedd0aPHj2iadOmcdddd0VRUVGd53P55ZdH7969o6SkJFq2bBn77bffeveffNq67925c+f19u23337xyCOPxIoVK2KnnXaq8xw+bdGiRRERUVlZWafxS5YsiZ133nmTY3r27Bm33nprXHjhhfHjH/84Kisr86Jhc8aPHx9f+tKX4oMPPogPPvggIiK6d+8eq1atiokTJ8aQIUMi4v/+bIqLi6Njx455z99nn33yvv7nP/8ZS5YsibFjx8bYsWM3+D0XLlxY5/kB7CjEB8BnVFxcHH369Ilf/vKXMW/evOjatWu9j/HII49ExP+9ajBv3rz1fvndlAMOOGC7esvYdTek77XXXnUa//7770eHDh02Oeb000+P559/Pm655ZaN/rK/MfPmzYsZM2ZERMS+++673v4JEybk4qOu1n1GyMCBA+Oss87a4Jhu3brV65gAOwLxAbAVrFmzJiIiPvroo4iI6NChQzz22GOxfPnyvFc//vGPf+T2rzN79uz46U9/Gt/97ndj1qxZcc4558SLL75Y51cO6mvd954zZ856+/7xj39Ey5Ytc6961OcVmHVmzpwZEbHRy74+bfXq1fHqq6/G17/+9U2OKy4ujhtuuCFefPHFqK6ujttuuy0WLFiQd5P/xkyYMCGaNGkSv/3tb9e7gf+ZZ56Jm2++Od56661o3759dOjQIWpra6O6ujovVF599dW85+2+++7RrFmzWLt27XYVfgDbO/d8AHxGq1evjkcffTTKyspyl1Udf/zxsXbt2rj11lvzxt50001RVFQUxx13XO65gwYNiqqqqvjlL38Z48aNiwULFtTrXZzqq02bNnHQQQfF3XffnffJ5S+99FI8+uijcfzxx+e2rYuQ+nzC+f333x+dO3eOLl26bHbspEmToqamJvf2wJtyyy23xBNPPBETJkyIo446Knr16lWn+UyYMCF69+4dp512Wpx66ql5j3VvJbzuM1qOPfbYiIi47bbb1vven1ZSUhKnnHJK/OEPf4iXXnppve/5z3/+s05zA9jReOUDoJ6mTJmSewVj4cKF8bvf/S7mzZsXI0aMiObNm0dExIknnhh9+vSJkSNHxhtvvBEHHnhgPProozFp0qQYPnx47L333hERcdVVV8WsWbPi8ccfj2bNmkW3bt3i8ssvj8suuyxOPfXUvBDYmq6//vo47rjjomfPnnH22Wfn3mq3srIy77MsDj744IiIGDlyZJx++unRpEmTOPHEEzd4P8jrr78e1113Xfztb3+Lk08+OcaPH5/bt+6yp6lTp0b79u2jdevWccUVV8Rtt90Whx12WBxzzDGbnO/LL78cP/7xj2PUqFHRo0ePOv+cf/3rX+PVV1+NYcOGbXD/nnvuGV/84hdjwoQJcckll8TBBx8cp5xySvziF7+IDz74IPdWu3Pnzo2I/FeCrr322njyySfj0EMPjX/7t3+L/fffPxYvXhzPP/98PPbYY7F48e
I6zxNgh1Hgd9sCaDA29Fa7TZs2zQ466KDs9ttvz2pra/PGL1++PLvooouyqqqqrEmTJtm+++6bXX/99blxzz33XFZ
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Box plot для столбца 'Age'\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['Age'])\n",
|
|||
|
"plt.title('Box Plot для Age')\n",
|
|||
|
"plt.xlabel('Age')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 116,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACyfklEQVR4nOzdeXhTVfoH8G9SugItLaW0rC2FEUoZ9qUgosgqAyI6Io4CioziLo4LMyOLzIjouI06IPhTHFBRR1RQqCKggBZBWQQBRSig0IK0tIVCW2ju749yQ9JmOecm9+Ym+X6eh+ehyentyc3NzXnP8h6LoigKiIiIiIiIyC1roCtARERERERkdgyciIiIiIiIvGDgRERERERE5AUDJyIiIiIiIi8YOBEREREREXnBwImIiIiIiMgLBk5EREREREReMHAiIiIiIiLygoETERERERGRFwyciIguOHjwICwWCxYtWhToqjjJzc1Fly5dEBMTA4vFgpKSkkBXKah98cUXsFgs+OKLL6R/d+LEiWjQoIH/KxUiJk6ciPT0dKfHLBYLZs6cGZD6hDpX55uI9MPAiSgM7Ny5E9dddx1at26NmJgYNG/eHIMHD8aLL76o299866238Pzzz9d5/OjRo5g5cya2b9+u29+uTW0oq/8iIyPRpk0bjB8/HgcOHPDL3/j6668xc+ZMvwc1RUVFuP766xEbG4uXX34ZixcvRv369b3+3n/+8x9YLBb07t3br/UxGzXYdXxvk5OT0bdvX/z1r3/F4cOHA11Fr/7zn//oHqwb8RkIdunp6U7nyPFfRUVFwOoViHsmEblWL9AVICJ9ff3117jiiivQqlUrTJ48Gampqfjll1+wadMmvPDCC7jnnnt0+btvvfUWdu3ahfvvv9/p8aNHj2LWrFlIT09Hly5ddPnb7tx7773o2bMnzp07h61bt2LBggX45JNPsHPnTjRr1synY3/99deYNWsWJk6ciEaNGvmnwgC2bNmCU6dOYfbs2Rg0aJDw77355ptIT0/H5s2b8fPPP6Nt27Z+q5MZjRs3DldddRVsNhtOnjyJLVu24Pnnn8cLL7yA//u//8MNN9xgL3vZZZfh7NmziIqKCmCNL/rPf/6D5ORkTJw4Ufe/pednwJ2zZ8+iXr3gaG506dIFDz74YJ3HA3mteLpnLly4EDabLTAVIwpDwXEnIyLN/vnPfyIhIQFbtmyp06A/fvx4YCqlg/Lycq8jMf3798d1110HALjlllvwu9/9Dvfeey/eeOMNTJs2zYhqSlPfI5lgLD8/H19//TWWLVuG22+/HW+++SZmzJihUw3NoVu3brjpppucHjt06BCGDBmCCRMmoEOHDujcuTMAwGq1IiYmJhDVDLhAfAb8ea5tNhuqqqp0e/+aN29e5zoys8jIyEBXgSiscKoeUYjbv38/Onbs6LLhnZKSUuexJUuWoFevXoiLi0NiYiIuu+wyfPbZZ/bnP/roI4wYMQLNmjVDdHQ0MjMzMXv2bFRXV9vLXH755fjkk09w6NAh+1SX9PR0fPHFF+jZsyeAmkab+pzjNKVvvvkGw4YNQ0JCAuLi4jBgwAB89dVXTnWcOXMmLBYLdu/ejRtvvBGJiYm49NJLpc/NwIEDAdQEGp6sXbsW/fv3R/369dGoUSNcffXV2LNnj1N9HnroIQBARkaG/XUdPHjQ43Hfe+89dO/eHbGxsUhOTsZNN92EI0eO2J+//PLLMWHCBABAz549YbFYhEYl3nzzTSQmJmLEiBG47rrr8Oabb7osV1RUhJtvvhnx8fFo1KgRJkyYgB07drhc57V3715cd911SEpKQkxMDHr06IHly5d7rMe5c+eQlJSEW265pc5zZWVliImJwV/+8hf7Yy+++CI6duxov/Z69OiBt956y+vrdad169ZYtGgRqqqq8NRTT9kfd7XGacOGDfjjH/+IVq1aITo6Gi1btsQDDzyAs2fPujz2gQMHMHToUN
SvXx/NmjXD448/DkVRnMrYbDY8//zz6NixI2JiYtC0aVPcfvvtOHnypL1Meno6fvjhB3z55Zf26+byyy+3P19SUoL7778fLVu2RHR0NNq2bYu5c+fWGWVYunQpunfvjoYNGyI+Ph6dOnXCCy+84PUcufoMrFq1yn69N2zYECNGjMAPP/xQ53c//PBDZGdnIyYmBtnZ2fjggw9c/g1Xa5y++OIL9OjRAzExMcjMzMQrr7xi/1zX/t27774bb775Jjp27Ijo6Gjk5uYCAI4cOYJbb70VTZs2RXR0NDp27IjXXnutzt+vrKzEjBkz0LZtW/t7+/DDD6OystLr+XHkqn4AsGjRojqf9/T0dPzhD3/Axo0b0atXL8TExKBNmzb473//W+f3S0pK8MADDyA9PR3R0dFo0aIFxo8fjxMnTni9Z7pa41ReXo4HH3zQfs1ccskl+Ne//lXn+lTPrfo+qudQPb9EVBdHnIhCXOvWrZGXl4ddu3YhOzvbY9lZs2Zh5syZ6Nu3Lx5//HFERUXhm2++wdq1azFkyBAANY2EBg0aYOrUqWjQoAHWrl2L6dOno6ysDE8//TQA4G9/+xtKS0vx66+/4rnnngMANGjQAB06dMDjjz+O6dOn489//jP69+8PAOjbty+AmgBl+PDh6N69O2bMmAGr1YrXX38dAwcOxIYNG9CrVy+n+v7xj39Eu3bt8MQTT9RpFIjYv38/AKBx48Zuy3z++ecYPnw42rRpg5kzZ+Ls2bN48cUX0a9fP2zduhXp6ekYM2YMfvrpJ7z99tt47rnnkJycDABo0qSJ2+MuWrQIt9xyC3r27Ik5c+bg2LFjeOGFF/DVV19h27ZtaNSoEf72t7/hkksuwYIFC/D4448jIyMDmZmZXl/Xm2++iTFjxiAqKgrjxo3DvHnzsGXLFnsDDKhp1I8cORKbN2/GlClT0L59e3z00Uf2QM3RDz/8gH79+qF58+Z49NFHUb9+fbz77rsYPXo03n//fVxzzTUu6xEZGYlrrrkGy5YtwyuvvOI03enDDz9EZWWlfQrdwoULce+99+K6667Dfffdh4qKCnz//ff45ptvcOONN3p9ze7k5OQgMzMTq1ev9ljuvffew5kzZzBlyhQ0btwYmzdvxosvvohff/0V7733nlPZ6upqDBs2DH369MFTTz2F3NxczJgxA+fPn8fjjz9uL3f77bfb3+d7770X+fn5eOmll7Bt2zZ89dVXiIyMxPPPP4977rkHDRo0wN/+9jcAQNOmTQEAZ86cwYABA3DkyBHcfvvtaNWqFb7++mtMmzYNBQUF9jWEq1evxrhx43DllVdi7ty5AIA9e/bgq6++wn333efxddf+DCxevBgTJkzA0KFDMXfuXJw5cwbz5s3DpZdeim3bttkb6Z999hmuvfZaZGVlYc6cOSgqKsItt9yCFi1aeH1Ptm3bhmHDhiEtLQ2zZs1CdXU1Hn/8cbefl7Vr1+Ldd9/F3XffjeTkZKSnp+PYsWPo06ePvfHfpEkTrFq1CpMmTUJZWZl9irDNZsOoUaOwceNG/PnPf0aHDh2wc+dOPPfcc/jpp5/w4YcfOv2tc+fO4cSJE06PxcXFIS4uzuvrqu3nn3/Gddddh0mTJmHChAl47bXXMHHiRHTv3h0dO3YEAJw+fRr9+/fHnj17cOutt6Jbt244ceIEli9fjl9//dXrPbM2RVEwatQorFu3DpMmTUKXLl3w6aef4qGHHsKRI0fs92PVxo0bsWzZMtx5551o2LAh/v3vf+Paa6/F4cOHPd4XicKWQkQh7bPPPlMiIiKUiIgIJScnR3n44YeVTz/9VKmqqnIqt2/fPsVqtSrXXHONUl1d7fSczWaz///MmTN1/sbtt9+uxMXFKRUVFfbHRowYobRu3bpO2S1btigAlNdff73O32jXrp0ydOjQOn8vIyNDGTx4sP2xGTNmKACUcePGCZ2DdevWKQCU1157Tfntt9+Uo0ePKp988omSnp6uWC
wWZcuWLYqiKEp+fn6dunXp0kVJSUlRioqK7I/t2LFDsVqtyvjx4+2PPf300woAJT8/32t9qqqqlJSUFCU7O1s5e/a
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
|
|||
|
"4 0 137 40 35 168 43.1 \n",
|
|||
|
"12 10 139 80 0 0 27.1 \n",
|
|||
|
"39 4 111 72 47 207 37.1 \n",
|
|||
|
"45 0 180 66 39 0 42.0 \n",
|
|||
|
"58 0 146 82 0 0 40.5 \n",
|
|||
|
"100 1 163 72 0 0 39.0 \n",
|
|||
|
"147 2 106 64 35 119 30.5 \n",
|
|||
|
"187 1 128 98 41 58 32.0 \n",
|
|||
|
"218 5 85 74 22 0 29.0 \n",
|
|||
|
"228 4 197 70 39 744 36.7 \n",
|
|||
|
"243 6 119 50 22 176 27.1 \n",
|
|||
|
"245 9 184 85 15 0 30.0 \n",
|
|||
|
"259 11 155 76 28 150 33.3 \n",
|
|||
|
"292 2 128 78 37 182 43.3 \n",
|
|||
|
"308 0 128 68 19 180 30.5 \n",
|
|||
|
"330 8 118 72 19 0 23.1 \n",
|
|||
|
"370 3 173 82 48 465 38.4 \n",
|
|||
|
"371 0 118 64 23 89 0.0 \n",
|
|||
|
"383 1 90 62 18 59 25.1 \n",
|
|||
|
"395 2 127 58 24 275 27.7 \n",
|
|||
|
"445 0 180 78 63 14 59.4 \n",
|
|||
|
"534 1 77 56 30 56 33.3 \n",
|
|||
|
"593 2 82 52 22 115 28.5 \n",
|
|||
|
"606 1 181 78 42 293 40.0 \n",
|
|||
|
"618 9 112 82 24 0 28.2 \n",
|
|||
|
"621 2 92 76 20 0 24.2 \n",
|
|||
|
"622 6 183 94 0 0 40.8 \n",
|
|||
|
"659 3 80 82 31 70 34.2 \n",
|
|||
|
"661 1 199 76 43 0 42.9 \n",
|
|||
|
"\n",
|
|||
|
" DiabetesPedigreeFunction Age Outcome \n",
|
|||
|
"4 2.288 33 1 \n",
|
|||
|
"12 1.441 57 0 \n",
|
|||
|
"39 1.390 56 1 \n",
|
|||
|
"45 1.893 25 1 \n",
|
|||
|
"58 1.781 44 0 \n",
|
|||
|
"100 1.222 33 1 \n",
|
|||
|
"147 1.400 34 0 \n",
|
|||
|
"187 1.321 33 1 \n",
|
|||
|
"218 1.224 32 1 \n",
|
|||
|
"228 2.329 31 0 \n",
|
|||
|
"243 1.318 33 1 \n",
|
|||
|
"245 1.213 49 1 \n",
|
|||
|
"259 1.353 51 1 \n",
|
|||
|
"292 1.224 31 1 \n",
|
|||
|
"308 1.391 25 1 \n",
|
|||
|
"330 1.476 46 0 \n",
|
|||
|
"370 2.137 25 1 \n",
|
|||
|
"371 1.731 21 0 \n",
|
|||
|
"383 1.268 25 0 \n",
|
|||
|
"395 1.600 25 0 \n",
|
|||
|
"445 2.420 25 1 \n",
|
|||
|
"534 1.251 24 0 \n",
|
|||
|
"593 1.699 25 0 \n",
|
|||
|
"606 1.258 22 1 \n",
|
|||
|
"618 1.282 50 1 \n",
|
|||
|
"621 1.698 28 0 \n",
|
|||
|
"622 1.461 45 0 \n",
|
|||
|
"659 1.292 27 1 \n",
|
|||
|
"661 1.394 22 1 \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADDWUlEQVR4nOzdeXhU1f0/8PdMSAIBEhICJGxJCFZWAUEgAlqRTSlI1a+ILYLSuC8Vq0hbWbSKuPtTKxq1Km5YpUoLxoWlgAahICiCiiGAYgKSAAGChGTu7494h5nMds6duXfOnXm/nsfnkcnJ5Myd7XzO+ZzPcWiapoGIiIiIiIgCcka7A0RERERERKpj4ERERERERBQCAyciIiIiIqIQGDgRERERERGFwMCJiIiIiIgoBAZOREREREREITBwIiIiIiIiCoGBExERERERUQgMnIiIiIiIiEJg4ERE9Itdu3bB4XDgpZdeinZXvBQXF6Nv375o2rQpHA4HDh06FO0u2dqqVavgcDiwatUq6d+dOnUqWrRoEflOxYipU6ciNzfX6zaHw4E5c+ZEpT+xzt/1JiLzMHAiigNffvklLr30UuTk5KBp06bo0KEDRo4ciSeffNK0v/n666/j8ccf97n9xx9/xJw5c7B582bT/nZj+kBZ/y8xMRFdunTBlVdeiZ07d0bkb3z66aeYM2dOxIOayspKXHbZZWjWrBmefvppLFy4EM2bNw/5e3//+9/hcDgwaNCgiPZHNXqw6/ncZmZm4uyzz8af//xn7NmzJ9pdDOnvf/+76cG6Fe8Bu8vNzfW6Rp7//fzzz1HrVzQ+M4nIvybR7gARmevTTz/Feeedh86dO6OwsBBZWVn4/vvvsW7dOjzxxBO4+eabTfm7r7/+OrZu3Yo//vGPXrf/+OOPmDt3LnJzc9G3b19T/nYgt9xyC8466yycPHkSmzZtwnPPPYelS5fiyy+/RPv27cO6708//RRz587F1KlT0apVq8h0GMCGDRtw5MgR3HvvvRgxYoTw77322mvIzc3F+vXr8d1336Fr164R65OKJk2ahAsvvBAulwsHDx7Ehg0b8Pjjj+OJJ57ACy+8gMsvv9zd9pxzzsHx48eRlJQUxR6f8ve//x2ZmZmYOnWq6X/LzPdAIMePH0eTJvYYbvTt2xe33367z+3RfK0E+8wsKiqCy+WKTseI4pA9PsmIyLD77rsPaWlp2LBhg8+Afv/+/dHplAmOHTsWciVm2LBhuPTSSwEAV111FX71q1/hlltuwcsvv4yZM2da0U1p+nMkE4yVlZXh008/xeLFi3Httdfitddew+zZs03qoRrOPPNM/P73v/e6bffu3Rg1ahSmTJmC7t27o0+fPgAAp9OJpk2bRqObUReN90Akr7XL5UJtba1pz1+HDh18XkcqS0xMjHYXiOIKU/WIYlxpaSl69uzpd+Ddtm1bn9teffVVDBw4ECkpKUhPT8c555yDDz/80P3z9957D2PHjkX79u2RnJyM/Px83Hvvvaivr3e3+fWvf42lS5di9+7d7lSX3NxcrFq1CmeddRaAhkGb/jPPNKXPPvsMY8aMQVpaGlJSUnDuuefik08+8erjnDlz4HA4sG3bNlxxxRVIT0/H0KFDpa/N8OHDATQEGsGsWLECw4YNQ/PmzdGqVStcdNFF2L59u1d/7rjjDgBAXl6e+3Ht2rUr6P3+85//RP/+/dGsWTNkZmbi97//Pfbu3ev++a9//WtMmTIFAHDWWWfB4XAIrUq89tprSE9Px9ixY3HppZfitdde89uusrISkydPRmpqKlq1aoUpU6Zgy5Ytfvd5ff3117j00kuRkZGBpk2bYsCAAViyZEnQfpw8eRIZGRm46qqrfH5WXV2Npk2b4k9/+pP7tieffBI9e/Z0v/YGDBiA119/PeTjDSQnJwcvvfQSamtr8eCDD7pv97fHac2aNfi///s/dO7cGcnJyejUqRNuu+02HD9+3O
9979y5E6NHj0bz5s3Rvn173HPPPdA0zauNy+XC448/jp49e6Jp06Zo164drr32Whw8eNDdJjc3F1999RX++9//ul83v/71r90/P3ToEP74xz+iU6dOSE5ORteuXTF//nyfVYY333wT/fv3R8uWLZGamorevXvjiSeeCHmN/L0H3n//fffrvWXLlhg7diy++uorn99999130atXLzRt2hS9evXCv/71L79/w98ep1WrVmHAgAFo2rQp8vPz8eyzz7rf141/96abbsJrr72Gnj17Ijk5GcXFxQCAvXv34uqrr0a7du2QnJyMnj174sUXX/T5+ydOnMDs2bPRtWtX93N755134sSJEyGvjyd//QOAl156yef9npubi9/85jdYu3YtBg4ciKZNm6JLly545ZVXfH7/0KFDuO2225Cbm4vk5GR07NgRV155JQ4cOBDyM9PfHqdjx47h9ttvd79mTj/9dDz88MM+r0/92urPo34N9etLRL644kQU43JyclBSUoKtW7eiV69eQdvOnTsXc+bMwdlnn4177rkHSUlJ+Oyzz7BixQqMGjUKQMMgoUWLFpg+fTpatGiBFStWYNasWaiursZDDz0EAPjLX/6Cw4cP44cffsBjjz0GAGjRogW6d++Oe+65B7NmzcI111yDYcOGAQDOPvtsAA0BygUXXID+/ftj9uzZcDqd+Mc//oHhw4djzZo1GDhwoFd//+///g+nnXYa7r//fp9BgYjS0lIAQOvWrQO2+fjjj3HBBRegS5cumDNnDo4fP44nn3wSQ4YMwaZNm5Cbm4uLL74Y3377Ld544w089thjyMzMBAC0adMm4P2+9NJLuOqqq3DWWWdh3rx52LdvH5544gl88skn+Pzzz9GqVSv85S9/wemnn47nnnsO99xzD/Ly8pCfnx/ycb322mu4+OKLkZSUhEmTJuGZZ57Bhg0b3AMwoGFQP27cOKxfvx7XX389unXrhvfee88dqHn66quvMGTIEHTo0AF33XUXmjdvjrfeegsTJkzAO++8g9/+9rd++5GYmIjf/va3WLx4MZ599lmvdKd3330XJ06ccKfQFRUV4ZZbbsGll16KW2+9FT///DO++OILfPbZZ7jiiitCPuZACgoKkJ+fj48++ihou3/+85+oqanB9ddfj9atW2P9+vV48skn8cMPP+Cf//ynV9v6+nqMGTMGgwcPxoMPPoji4mLMnj0bdXV1uOeee9ztrr32WvfzfMstt6CsrAxPPfUUPv/8c3zyySdITEzE448/jptvvhktWrTAX/7yFwBAu3btAAA1NTU499xzsXfvXlx77bXo3LkzPv30U8ycORPl5eXuPYQfffQRJk2ahPPPPx/z588HAGzfvh2ffPIJbr311qCPu/F7YOHChZgyZQpGjx6N+fPno6amBs888wyGDh2Kzz//3D1I//DDD3HJJZegR48emDdvHiorK3HVVVehY8eOIZ+Tzz//HGPGjEF2djbmzp2L+vp63HPPPQHfLytWrMBbb72Fm266CZmZmcjNzcW+ffswePBg9+C/TZs2eP/99zFt2jRUV1e7U4RdLhfGjx+PtWvX4pprrkH37t3x5Zdf4rHHHsO3336Ld9991+tvnTx5EgcOHPC6LSUlBSkpKSEfV2PfffcdLr30UkybNg1TpkzBiy++iKlTp6J///7o2bMnAODo0aMYNmwYtm/fjquvvhpnnnkmDhw4gCVLluCHH34I+ZnZmKZpGD9+PFauXIlp06ahb9+++OCDD3DHHXdg79697s9j3dq1a7F48WLccMMNaNmyJf7f//t/uOSSS7Bnz56gn4tEcUsjopj24YcfagkJCVpCQoJWUFCg3XnnndoHH3yg1dbWerXbsWOH5nQ6td/+9rdafX29189cLpf7/2tqanz+xrXXXqulpKRoP//8s/u2sWPHajk5OT5tN2zYoAHQ/vGPf/j8jdNOO00bPXq0z9/Ly8vTRo4c6b5t9uzZGgBt0qRJQtdg5c
qVGgDtxRdf1H766Sftxx9/1JYuXarl5uZqDodD27Bhg6ZpmlZWVubTt759+2pt27bVKisr3bdt2bJFczqd2pVXXum
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Имеется смещение в меньшую сторону\n",
|
|||
|
"df_cleaned = df.dropna()\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"Age\"], df[\"DiabetesPedigreeFunction\"])\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.ylabel(\"DiabetesPedigreeFunction\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Age vs DiabetesPedigreeFunction\")\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# уберем шумы\n",
|
|||
|
"\n",
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df[\"DiabetesPedigreeFunction\"].quantile(0.25)\n",
|
|||
|
"Q3 = df[\"DiabetesPedigreeFunction\"].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df[\"DiabetesPedigreeFunction\"] < (Q1 - threshold)) | (\n",
|
|||
|
" df[\"DiabetesPedigreeFunction\"] > (Q3 + threshold)\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае мы уберем выбросы\n",
|
|||
|
"median_charge = df[\"DiabetesPedigreeFunction\"].median()\n",
|
|||
|
"df.loc[outliers, \"DiabetesPedigreeFunction\"] = 0\n",
|
|||
|
"df = df[df.DiabetesPedigreeFunction != 0]\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"Age\"], df[\"DiabetesPedigreeFunction\"])\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.ylabel(\"DiabetesPedigreeFunction\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Age vs DiabetesPedigreeFunction\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 117,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 460\n",
|
|||
|
"Размер контрольной выборки: 154\n",
|
|||
|
"Размер тестовой выборки: 154\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Видим, что выборки не сбалансированы по значениям Age:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 118,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Age в обучающей выборке:\n",
|
|||
|
"Age\n",
|
|||
|
"22 39\n",
|
|||
|
"21 36\n",
|
|||
|
"25 31\n",
|
|||
|
"24 25\n",
|
|||
|
"26 23\n",
|
|||
|
"27 22\n",
|
|||
|
"28 21\n",
|
|||
|
"23 21\n",
|
|||
|
"41 17\n",
|
|||
|
"31 15\n",
|
|||
|
"29 15\n",
|
|||
|
"37 14\n",
|
|||
|
"30 12\n",
|
|||
|
"45 11\n",
|
|||
|
"46 11\n",
|
|||
|
"36 11\n",
|
|||
|
"42 10\n",
|
|||
|
"34 10\n",
|
|||
|
"38 10\n",
|
|||
|
"32 9\n",
|
|||
|
"35 8\n",
|
|||
|
"52 8\n",
|
|||
|
"33 7\n",
|
|||
|
"43 7\n",
|
|||
|
"40 7\n",
|
|||
|
"47 5\n",
|
|||
|
"50 5\n",
|
|||
|
"51 5\n",
|
|||
|
"39 4\n",
|
|||
|
"48 4\n",
|
|||
|
"57 4\n",
|
|||
|
"66 3\n",
|
|||
|
"49 3\n",
|
|||
|
"44 3\n",
|
|||
|
"58 2\n",
|
|||
|
"55 2\n",
|
|||
|
"53 2\n",
|
|||
|
"63 2\n",
|
|||
|
"67 2\n",
|
|||
|
"54 2\n",
|
|||
|
"56 2\n",
|
|||
|
"60 2\n",
|
|||
|
"59 2\n",
|
|||
|
"61 1\n",
|
|||
|
"65 1\n",
|
|||
|
"81 1\n",
|
|||
|
"69 1\n",
|
|||
|
"68 1\n",
|
|||
|
"70 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в контрольной выборке:\n",
|
|||
|
"Age\n",
|
|||
|
"22 20\n",
|
|||
|
"21 15\n",
|
|||
|
"24 11\n",
|
|||
|
"23 10\n",
|
|||
|
"25 9\n",
|
|||
|
"33 8\n",
|
|||
|
"26 7\n",
|
|||
|
"31 6\n",
|
|||
|
"30 5\n",
|
|||
|
"28 5\n",
|
|||
|
"27 5\n",
|
|||
|
"39 4\n",
|
|||
|
"29 4\n",
|
|||
|
"42 4\n",
|
|||
|
"40 4\n",
|
|||
|
"32 3\n",
|
|||
|
"37 2\n",
|
|||
|
"46 2\n",
|
|||
|
"51 2\n",
|
|||
|
"44 2\n",
|
|||
|
"34 2\n",
|
|||
|
"43 2\n",
|
|||
|
"41 2\n",
|
|||
|
"45 2\n",
|
|||
|
"54 2\n",
|
|||
|
"35 2\n",
|
|||
|
"58 1\n",
|
|||
|
"53 1\n",
|
|||
|
"61 1\n",
|
|||
|
"59 1\n",
|
|||
|
"64 1\n",
|
|||
|
"36 1\n",
|
|||
|
"50 1\n",
|
|||
|
"72 1\n",
|
|||
|
"49 1\n",
|
|||
|
"47 1\n",
|
|||
|
"66 1\n",
|
|||
|
"62 1\n",
|
|||
|
"69 1\n",
|
|||
|
"55 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в тестовой выборке:\n",
|
|||
|
"Age\n",
|
|||
|
"22 13\n",
|
|||
|
"21 12\n",
|
|||
|
"24 10\n",
|
|||
|
"29 10\n",
|
|||
|
"28 9\n",
|
|||
|
"25 8\n",
|
|||
|
"23 7\n",
|
|||
|
"38 6\n",
|
|||
|
"27 5\n",
|
|||
|
"39 4\n",
|
|||
|
"30 4\n",
|
|||
|
"43 4\n",
|
|||
|
"42 4\n",
|
|||
|
"32 4\n",
|
|||
|
"36 4\n",
|
|||
|
"58 4\n",
|
|||
|
"31 3\n",
|
|||
|
"62 3\n",
|
|||
|
"44 3\n",
|
|||
|
"26 3\n",
|
|||
|
"41 3\n",
|
|||
|
"37 3\n",
|
|||
|
"60 3\n",
|
|||
|
"54 2\n",
|
|||
|
"50 2\n",
|
|||
|
"34 2\n",
|
|||
|
"65 2\n",
|
|||
|
"63 2\n",
|
|||
|
"45 2\n",
|
|||
|
"40 2\n",
|
|||
|
"33 2\n",
|
|||
|
"53 2\n",
|
|||
|
"55 1\n",
|
|||
|
"48 1\n",
|
|||
|
"67 1\n",
|
|||
|
"56 1\n",
|
|||
|
"51 1\n",
|
|||
|
"49 1\n",
|
|||
|
"57 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['Age'].value_counts()\n",
|
|||
|
" print(f\"Распределение Age в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Используем oversampling (RandomOverSampler), чтобы выровнять распределение Age в выборках"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 119,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Age в обучающей выборке после oversampling:\n",
|
|||
|
"Age\n",
|
|||
|
"26 39\n",
|
|||
|
"25 39\n",
|
|||
|
"33 39\n",
|
|||
|
"23 39\n",
|
|||
|
"35 39\n",
|
|||
|
"39 39\n",
|
|||
|
"29 39\n",
|
|||
|
"27 39\n",
|
|||
|
"21 39\n",
|
|||
|
"28 39\n",
|
|||
|
"41 39\n",
|
|||
|
"31 39\n",
|
|||
|
"24 39\n",
|
|||
|
"22 39\n",
|
|||
|
"42 39\n",
|
|||
|
"32 39\n",
|
|||
|
"36 39\n",
|
|||
|
"51 39\n",
|
|||
|
"34 39\n",
|
|||
|
"30 39\n",
|
|||
|
"45 39\n",
|
|||
|
"49 39\n",
|
|||
|
"57 39\n",
|
|||
|
"53 39\n",
|
|||
|
"48 39\n",
|
|||
|
"55 39\n",
|
|||
|
"46 39\n",
|
|||
|
"47 39\n",
|
|||
|
"52 39\n",
|
|||
|
"50 39\n",
|
|||
|
"37 39\n",
|
|||
|
"58 39\n",
|
|||
|
"61 39\n",
|
|||
|
"40 39\n",
|
|||
|
"65 39\n",
|
|||
|
"67 39\n",
|
|||
|
"38 39\n",
|
|||
|
"66 39\n",
|
|||
|
"81 39\n",
|
|||
|
"44 39\n",
|
|||
|
"43 39\n",
|
|||
|
"56 39\n",
|
|||
|
"54 39\n",
|
|||
|
"60 39\n",
|
|||
|
"69 39\n",
|
|||
|
"70 39\n",
|
|||
|
"68 39\n",
|
|||
|
"63 39\n",
|
|||
|
"59 39\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в контрольной выборке после oversampling:\n",
|
|||
|
"Age\n",
|
|||
|
"25 20\n",
|
|||
|
"58 20\n",
|
|||
|
"27 20\n",
|
|||
|
"26 20\n",
|
|||
|
"31 20\n",
|
|||
|
"22 20\n",
|
|||
|
"28 20\n",
|
|||
|
"39 20\n",
|
|||
|
"21 20\n",
|
|||
|
"45 20\n",
|
|||
|
"41 20\n",
|
|||
|
"61 20\n",
|
|||
|
"51 20\n",
|
|||
|
"64 20\n",
|
|||
|
"36 20\n",
|
|||
|
"30 20\n",
|
|||
|
"53 20\n",
|
|||
|
"40 20\n",
|
|||
|
"23 20\n",
|
|||
|
"59 20\n",
|
|||
|
"29 20\n",
|
|||
|
"42 20\n",
|
|||
|
"46 20\n",
|
|||
|
"43 20\n",
|
|||
|
"44 20\n",
|
|||
|
"34 20\n",
|
|||
|
"24 20\n",
|
|||
|
"32 20\n",
|
|||
|
"33 20\n",
|
|||
|
"72 20\n",
|
|||
|
"35 20\n",
|
|||
|
"50 20\n",
|
|||
|
"54 20\n",
|
|||
|
"47 20\n",
|
|||
|
"49 20\n",
|
|||
|
"66 20\n",
|
|||
|
"62 20\n",
|
|||
|
"69 20\n",
|
|||
|
"55 20\n",
|
|||
|
"37 20\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в тестовой выборке после oversampling:\n",
|
|||
|
"Age\n",
|
|||
|
"43 13\n",
|
|||
|
"21 13\n",
|
|||
|
"34 13\n",
|
|||
|
"50 13\n",
|
|||
|
"55 13\n",
|
|||
|
"22 13\n",
|
|||
|
"44 13\n",
|
|||
|
"37 13\n",
|
|||
|
"65 13\n",
|
|||
|
"40 13\n",
|
|||
|
"60 13\n",
|
|||
|
"29 13\n",
|
|||
|
"28 13\n",
|
|||
|
"24 13\n",
|
|||
|
"36 13\n",
|
|||
|
"48 13\n",
|
|||
|
"45 13\n",
|
|||
|
"39 13\n",
|
|||
|
"41 13\n",
|
|||
|
"27 13\n",
|
|||
|
"23 13\n",
|
|||
|
"38 13\n",
|
|||
|
"25 13\n",
|
|||
|
"67 13\n",
|
|||
|
"31 13\n",
|
|||
|
"30 13\n",
|
|||
|
"54 13\n",
|
|||
|
"33 13\n",
|
|||
|
"62 13\n",
|
|||
|
"26 13\n",
|
|||
|
"56 13\n",
|
|||
|
"58 13\n",
|
|||
|
"53 13\n",
|
|||
|
"63 13\n",
|
|||
|
"42 13\n",
|
|||
|
"32 13\n",
|
|||
|
"51 13\n",
|
|||
|
"49 13\n",
|
|||
|
"57 13\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop('Age', axis=1)\n",
|
|||
|
" y = df['Age']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Diamonds Prices2022"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"В данном наборе данных представлены цены на бриллианты. Входные данные: вес (в каратах), качество огранки, цвет, чистота, глубина, площадка, цена, габариты (x, y, z). Цель: узнать, насколько дорого обходятся девушкам их лучшие друзья (Мэрилин Монро)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 120,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',\n",
|
|||
|
" 'price', 'x', 'y', 'z'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Diamonds Prices2022.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 121,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsLUlEQVR4nO3deZTXdd3//8fMIIs6DBeC4CQiuG8jKegxtUtLyzWzNDVKze1kmmtaHBWwy6wrvC6t3FvEvlZ2hVu573K8wD0SLddQKxBFZVUWmc/vD38zFyPbgONrGLjdzpkj8/m8l+f7M5/jmft83u/Pp6pSqVQCAADwMatu7wEAAIA1g/gAAACKEB8AAEAR4gMAAChCfAAAAEWIDwAAoAjxAQAAFCE+AACAIsQHAABQhPgAAACKEB8Aq5FXXnklVVVVGT16dHuPAgCLER8ASzB69OhUVVW1+Fp//fWz55575o477ig+z4MPPthilrXWWisDBw7MkUcemb///e9tso9x48Zl5MiRmT59eptsj+Tdd9/NyJEj8+CDD7b3KACrhE7tPQDAquz73/9+BgwYkEqlkqlTp2b06NHZb7/98qc//SkHHHBA8XlOOeWUDBkyJAsWLMhTTz2Vq6++OrfddlsmTpyY+vr6j7TtcePG5fzzz8/RRx+dHj16tM3Aa7h33303559/fpJkjz32aN9hAFYB4gNgGfbdd98MHjy4+ftjjz02ffr0ye9+97t2iY/dd989hxxySJLkG9/4RjbffPOccsopufbaazNs2LDi86xp3n///TQ2NqZz587tPQpAh+S0K4AV0KNHj3Tr1i2dOrX8282cOXNy5plnpl+/funSpUu22GKLXHTRRalUKkmS9957L1tuuWW23HLLvPfee83rvf3229lggw3yqU99KgsXLlzheT7zmc8kSSZNmrTM5e6///7svvvuWWedddKjR48cdNBB+dvf/tZ8/8iRI3PWWWclSQYMGNB8etcrr7yy3Bk+fHpa09eSTjU6+uijl7jsyJEjWyw3ZsyYDB48OLW1tS2Wu+iii5Y7z/Tp03P66adn4403TpcuXbLhhhvmyCOPzLRp05Ik8+fPz/Dhw7Pjjjumrq4u66yzTnbfffc88MADLbbTdP3MRRddlEsuuSSbbLJJunTpkr/+9a+t2sYrr7yS3r17J0nOP//8pR4rwJrEKx8AyzBjxoxMmzYtlUolb7zxRn72s59l9uzZ+drXvta8TKVSyRe+8IU88MADOfbYYzNo0KDcddddOeuss/Kvf/0rF198cbp165Zrr702u+66a84555z893//d5LkpJNOyowZMzJ69OjU1NSs8Hwvv/xykmS99dZb6jL33ntv9t133wwcODAjR47Me++9l5/97GfZdddd89RTT2XjjTfOl770pbzwwgv53e9+l4svvji9evVKkuZfnpdn7733zpFHHpkkefzxx/PTn/50qcv26tUrF198cfP3X//611vcP378+HzlK1/J9ttvnx/96Eepq6vLtGnTcvrppy93jtmzZ2f33XfP3/72txxzzDHZYYcdMm3atPzxj3/MP//5z/Tq1SszZ87ML37xixxxxBE5/vjjM2vWrPzyl7/M5z//+Tz22GMZNGhQi21ec801mTt3bk444YR06dIlPXv2bNU2evfunSuuuCInnnhiDj744HzpS19KkjQ0NLTqMQVYLVUAWMw111xTSbLYV5cuXSqjR49usezNN99cSVK54IILWtx+yCGHVKqqqiovvfRS823Dhg2rVFdXV8aOHVv5wx/+UElSueSSS5Y7zwMPPFBJUvnVr35VefPNNyuTJ0+u3HbbbZWNN964UlVVVXn88ccrlUqlMmnSpEqSyjXXXNO87qBBgyrrr79+5a233mq+7S9/+Uulurq6cuSRRzbfNmrUqEqSyqRJk1r9OM2fP7+SpHLyySc339Z0XA888MBiyw8dOrQyYMCAFrclqYwYMaL5+2
HDhlWSVKZMmdJ8W9NxjRo1apnzDB8+vJKkcuONNy52X2NjY6VSqVTef//9yrx581rc984771T69OlTOeaYYxbbZ/fu3StvvPFGi+Vbu40333xzseMDWJN55QNgGS677LJsvvnmSZKpU6fmuuuuy3HHHZfa2trmv2TffvvtqampySmnnNJi3TPPPDNjxozJHXfckZNPPjnJB6c33XrrrTnqqKMye/bs/Pu///ti6y3LMccc0+L73r1759prr21xXcqipkyZkgkTJuTss89Oz549m29vaGjI3nvvndtvv73V+16SuXPnJkm6du3aquXnz5+fLl26LHOZWbNmpbq6eqUuer/hhhuy/fbb5+CDD17svqqqqiRJTU1N86tMjY2NmT59ehobGzN48OA89dRTi6335S9/ebFXgFZ0GwB8wDUfAMuw0047Za+99spee+2VoUOH5rbbbsvWW2+dk08+OfPnz0+SvPrqq6mvr09tbW2Ldbfaaqvm+5t07tw5v/rVrzJp0qTMmjUr11xzTfMvxa0xfPjw3HPPPbn//vvz9NNPZ/LkyYudtrSopn1vscUWi9231VZbZdq0aZkzZ06r9/9hTddR1NXVtWr56dOnZ911113mMrvssksaGxtz6qmn5uWXX860adPyzjvvtGr7L7/8crbddtvlLnfttdemoaEhXbt2zXrrrZfevXvntttuy4wZMxZbdsCAAR95GwB8QHwArIDq6ursueeemTJlSl588cWV2sZdd92V5INXDVZ0G9ttt1322muv7Lnnntluu+0Wu/C9tKYL0jfeeONWLf/666+nb9++y1zm8MMPz5lnnpnRo0dn0003Te/evbPDDjt8xEn/z3XXXZejjz46m2yySX75y1/mzjvvzD333JPPfOYzaWxsXGz5bt26feRtAPABp10BrKD3338/yQcXNydJ//79c++992bWrFktXv147rnnmu9v8vTTT+f73/9+vvGNb2TChAk57rjjMnHixFa/crCimvb9/PPPL3bfc889l169emWdddZJkhV6BabJE088kSRLPe1rUQsWLMhLL72UffbZZ5nLVVdX56KLLsrEiRMzadKkXH755Zk6dWqLi/yXZpNNNskzzzyzzGXGjBmTgQMH5sYbb2xxzCNGjFju9ld0GyvzmAKszrzyAbACFixYkLvvvjudO3duPq1qv/32y8KFC3PppZe2WPbiiy9OVVVV9t133+Z1jz766NTX1+cnP/lJRo8enalTp7bqXZxW1gYbbJBBgwbl2muvbfHJ5c8880zuvvvu7Lfffs23NUXIinzC+ZgxY7LFFltkyy23XO6yt9xyS957773mtwdelp/97Ge5//7785vf/CZ77bVXdt1111bN8+Uvfzl/+ctfctNNNy12X+X/f9vjpms1mr5PkkcffTTjx49v1T5WZBtrr712khV7TAFWZ175AFiGO+64o/kVjDfeeCO//e1v8+KLL+Z73/teunfvniQ58MADs+eee+acc87JK6+8ku233z533313brnllpx22mnZZJNNkiQXXHBBJkyYkPvuuy+1tbVpaGjI8OHDc+655+aQQw5pEQJtadSoUdl3332zyy675Nhjj21+q926uroWnzmx4447JknOOeecHH744VlrrbVy4IEHNkfJov7+97/nxz/+cR577LF86UtfynXXXdd83+OPP54kueeee7LRRhulb9++GTFiRC6//PJ86lOfyuc+97llzvvss8/m7LPPzsiRIzNkyJAVOtazzjorY8aMyaGHHppjjjkmO+64Y95+++388Y9/zJVXXpntt98+BxxwQG688cYcfPDB2X///TNp0qRceeWV2XrrrZtfzVqe1m6jW7du2XrrrfP73/8+m2++eXr27Jltt922VdelAKyW2vndtgBWSUt6q92uXbtWBg0aVLniiiua37a1yaxZsyqnn356pb6+vrLWWmtVNttss8qoUaOal3vyyScrnTp1qnz7299usd77779fGT
JkSKW+vr7yzjvvLHWeprfa/cMf/rDMuZf0VruVSqVy7733VnbddddKt27dKt27d68ceOCBlb/+9a+Lrf8f//EflU9
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['carat'])\n",
|
|||
|
"plt.title('Box Plot для carat')\n",
|
|||
|
"plt.xlabel('carat')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 122,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0EAAAIjCAYAAADFthA8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACr0klEQVR4nOzdeXwU9f0/8Nfs5tgkJJuLsAlCEg6FGAFBEOSwIrQc4lWrolCvoiJYr/araBEoVaS2VX/VokVBC6K29UAEY7mUw2CUcMWgQkwAIQGSwCbkTnZ+f8RZ95jdndmdvbKv5+Ph4yGb2ZnPzs7uft7z/nzeH0EURRFEREREREQRQhfsBhAREREREQUSgyAiIiIiIoooDIKIiIiIiCiiMAgiIiIiIqKIwiCIiIiIiIgiCoMgIiIiIiKKKAyCiIiIiIgoojAIIiIiIiKiiMIgiIiIiIiIIgqDICIicqmiogKCIOD1118PdlPsFBQUYMiQITAYDBAEAWfPnvXbsW6//Xbk5OT4bf9ERBR4DIKIKCIdOHAAN9xwA7Kzs2EwGNCzZ09MnDgRf//73/12zDVr1uD55593evzEiRNYuHAh9u7d67djO/r0008hCIL1v+joaPTp0we//vWv8f3332tyjM8//xwLFy7UPECpqanBjTfeiLi4OLz00ktYtWoVEhISZLd9/fXX7V6nwWDA+eefj7lz5+LkyZOatisc7d27FzNmzECvXr0QGxuL1NRUTJgwAStXrkRHR0dQ2vT000/jgw8+CMqxiShyRAW7AUREgfb555/jiiuuQO/evTFr1iyYTCYcO3YMu3btwgsvvID777/fL8dds2YNSkpK8OCDD9o9fuLECSxatAg5OTkYMmSIX47tym9/+1sMHz4cbW1tKC4uxj//+U+sX78eBw4cQFZWlk/7/vzzz7Fo0SLcfvvtSE5O1qbBAL788kvU19dj8eLFmDBhgqLn/PGPf0Rubi6am5uxY8cOLFu2DBs2bEBJSQni4+PdPnf58uWwWCxaND2kvPrqq7j33nvRo0cPzJw5E/3790d9fT02b96Mu+66C5WVlXj88ccD3q6nn34aN9xwA6699tqAH5uIIgeDICKKOE899RSMRiO+/PJLp875qVOngtMoP2hoaHCZIZGMHTsWN9xwAwDgjjvuwPnnn4/f/va3eOONNzBv3rxANFM16T1SE1hNnjwZl1xyCQDgN7/5DdLS0vC3v/0Na9euxfTp02WfI52/6Ohon9scanbt2oV7770Xo0aNwoYNG5CYmGj924MPPoivvvoKJSUlPh/HYrGgtbUVBoPB530REWmJw+GIKOKUlZXhwgsvlO1EZ2RkOD22evVqjBgxAvHx8UhJScG4cePwv//9z/r3tWvXYurUqcjKykJsbCz69u2LxYsX2w0n+tnPfob169fjyJEj1qFZOTk5+PTTTzF8+HAAnUGI9DfbOThffPEFJk2aBKPRiPj4eFx++eXYuXOnXRsXLlwIQRBQWlqKW265BSkpKRgzZozqczN+/HgAQHl5udvttmzZgrFjxyIhIQHJycm45pprcPDgQbv2/P73vwcA5ObmWl9XRUWF2/3+5z//wbBhwxAXF4f09HTMmDEDx48ft/79Zz/7GW677TYAwPDhwyEIAm6//XafX+ftt9+Obt26oaysDFOmTEFiYiJuvfVW698c5wRZLBa88MILuOiii2AwGNC9e3dMmjQJX331ld12q1evtr6e1NRU3HzzzTh27Jjbtv33v/+FIAj47LPPnP72yiuvQBAEa4BSVVWFO+64A+eddx5iY2ORmZmJa665xuN5XrRoEQRBwJtvvmkXAEkuueQSu/P6l7/8BZdddhnS0tIQFxeHYcOG4b///a/T8wRBwNy5c/Hmm2/iwgsvRGxsLAoKChTvQxAENDQ04I033rBeM968v0REnjATREQRJzs7G4WFhSgpKUF+fr7bbRctWoSFCxfisssuwx//+E
fExMTgiy++wJYtW/Dzn/8cQOe8k27duuHhhx9Gt27dsGXLFjz55JOoq6vDs88+CwB44oknYDab8cMPP+C5554DAHTr1g0DBw7EH//4Rzz55JO4++67MXbsWADAZZddBqAz2Jg8eTKGDRuGBQsWQKfTYeXKlRg/fjy2b9+OESNG2LX3V7/6Ffr374+nn34aoiiqPjdlZWUAgLS0NJfbbNq0CZMnT0afPn2wcOFCNDU14e9//ztGjx6N4uJi5OTk4Prrr8d3332Ht956C8899xzS09MBAN27d3e539dffx133HEHhg8fjiVLluDkyZN44YUXsHPnTuzZswfJycl44okncMEFF+Cf//yndYhb3759NXmd7e3t+MUvfoExY8bgL3/5i9thcnfddRdef/11TJ48Gb/5zW/Q3t6O7du3Y9euXdaM01NPPYX58+fjxhtvxG9+8xucPn0af//73zFu3Djr65EzdepUdOvWDf/+979x+eWX2/3tnXfewYUXXmi9bn/5y1/i66+/xv3334+cnBycOnUKGzduxNGjR10Wc2hsbMTmzZsxbtw49O7dW9H5euGFF3D11Vfj1ltvRWtrK95++2386le/wkcffYSpU6fabbtlyxb8+9//xty5c5Genm5th5J9rFq1Cr/5zW8wYsQI3H333QDg1ftLROSRSEQUYf73v/+Jer1e1Ov14qhRo8T/+7//Ez/55BOxtbXVbrtDhw6JOp1OvO6668SOjg67v1ksFuv/NzY2Oh3jnnvuEePj48Xm5mbrY1OnThWzs7Odtv3yyy9FAOLKlSudjtG/f3/xF7/4hdPxcnNzxYkTJ1ofW7BggQhAnD59uqJzsHXrVhGAuGLFCvH06dPiiRMnxPXr14s5OTmiIAjil19+KYqiKJaXlzu1bciQIWJGRoZYU1NjfWzfvn2iTqcTf/3rX1sfe/bZZ0UAYnl5ucf2tLa2ihkZGWJ+fr7Y1NRkffyjjz4SAYhPPvmk9bGVK1eKAKxtdEfadtOmTeLp06fFY8eOiW+//baYlpYmxsXFiT/88IMoiqJ42223iQDExx57zGkft912m937tmXLFhGA+Nvf/tZpW+l9qqioEPV6vfjUU0/Z/f3AgQNiVFSU0+OOpk+fLmZkZIjt7e3WxyorK0WdTif+8Y9/FEVRFM+cOSMCEJ999lmP58HWvn37RADiAw88oPg5jtd4a2urmJ+fL44fP97ucQCiTqcTv/76a6/3kZCQIN52222K20ZE5A0OhyOiiDNx4kQUFhbi6quvxr59+/DnP/8Zv/jFL9CzZ098+OGH1u0++OADWCwWPPnkk9Dp7L8uBUGw/n9cXJz1/+vr61FdXY2xY8eisbER33zzjdft3Lt3Lw4dOoRbbrkFNTU1qK6uRnV1NRoaGnDllVdi27ZtThP27733XlXHuPPOO9G9e3dkZWVh6tSp1qFIUjbDUWVlJfbu3Yvbb78dqamp1scHDRqEiRMnYsOGDepfKICvvvoKp06dwn333Wc3f2Tq1KkYMGAA1q9f79V+JRMmTED37t3Rq1cv3HzzzejWrRvef/999OzZ02672bNne9zXu+++C0EQsGDBAqe/SdfFe++9B4vFghtvvNH6vlVXV8NkMqF///7YunWr22PcdNNNOHXqFD799FPrY//9739hsVhw0003Aei87mJiYvDpp5/izJkzHtstqaurAwDZYXCu2F7jZ86cgdlsxtixY1FcXOy07eWXX468vDyf9kFE5G8cDkdEEWn48OF477330Nrain379uH999/Hc889hxtuuAF79+5FXl4eysrKoNPpZDt0tr7++mv84Q9/wJYtW6wdTInZbPa6jYcOHQIA6xwYOWazGSkpKdZ/5+bmqjrGk08+ibFjx0Kv1yM9PR0DBw5EVJTrn4YjR44AAC644AKnvw0cOBCffPKJooIMavY7YMAA7NixQ9X+HL300ks4//zzERUVhR49euCCCy5wCmyjoqJw3nnnedxXWV
kZsrKy7IJAR4cOHYIoiujfv7/s3z0VW5DmgL3zzju48sorAXQOhRsyZAjOP/98AEBsbCyWLl2KRx55BD169MDIkSN
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
"# Visualize the data after preprocessing\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"ax.scatter(df[\"price\"], df[\"carat\"])\n",
"ax.set_xlabel(\"price\")\n",
"ax.set_ylabel(\"carat\")\n",
"ax.set_title(\"Scatter Plot of Price vs Carat\")\n",
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Удаление строк с пустыми значениями"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 123,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
"# Keep only the rows that have no missing value in any column\n",
"df_cleaned = df.dropna(axis=0, how=\"any\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Применение методов приращения данных (аугментации)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 124,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 32365\n",
|
|||
|
"Размер контрольной выборки: 10789\n",
|
|||
|
"Размер тестовой выборки: 10789\n",
|
|||
|
"Распределение price в обучающей выборке:\n",
|
|||
|
"price\n",
|
|||
|
"789 80\n",
|
|||
|
"605 79\n",
|
|||
|
"544 72\n",
|
|||
|
"552 72\n",
|
|||
|
"828 71\n",
|
|||
|
" ..\n",
|
|||
|
"9942 1\n",
|
|||
|
"7787 1\n",
|
|||
|
"18663 1\n",
|
|||
|
"7979 1\n",
|
|||
|
"8164 1\n",
|
|||
|
"Name: count, Length: 9496, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в контрольной выборке:\n",
|
|||
|
"price\n",
|
|||
|
"625 34\n",
|
|||
|
"828 31\n",
|
|||
|
"605 30\n",
|
|||
|
"789 26\n",
|
|||
|
"544 26\n",
|
|||
|
" ..\n",
|
|||
|
"4188 1\n",
|
|||
|
"7541 1\n",
|
|||
|
"3498 1\n",
|
|||
|
"3314 1\n",
|
|||
|
"12196 1\n",
|
|||
|
"Name: count, Length: 5383, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в тестовой выборке:\n",
|
|||
|
"price\n",
|
|||
|
"802 33\n",
|
|||
|
"844 29\n",
|
|||
|
"776 29\n",
|
|||
|
"675 26\n",
|
|||
|
"645 25\n",
|
|||
|
" ..\n",
|
|||
|
"1567 1\n",
|
|||
|
"5529 1\n",
|
|||
|
"2031 1\n",
|
|||
|
"417 1\n",
|
|||
|
"5431 1\n",
|
|||
|
"Name: count, Length: 5338, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в обучающей выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"5076 80\n",
|
|||
|
"1789 80\n",
|
|||
|
"3931 80\n",
|
|||
|
"1263 80\n",
|
|||
|
"2026 80\n",
|
|||
|
" ..\n",
|
|||
|
"3678 80\n",
|
|||
|
"4592 80\n",
|
|||
|
"516 80\n",
|
|||
|
"7152 80\n",
|
|||
|
"2353 80\n",
|
|||
|
"Name: count, Length: 9496, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в контрольной выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"966 34\n",
|
|||
|
"13638 34\n",
|
|||
|
"3669 34\n",
|
|||
|
"1052 34\n",
|
|||
|
"2818 34\n",
|
|||
|
" ..\n",
|
|||
|
"4032 34\n",
|
|||
|
"544 34\n",
|
|||
|
"3362 34\n",
|
|||
|
"6559 34\n",
|
|||
|
"792 34\n",
|
|||
|
"Name: count, Length: 5383, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в тестовой выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"3742 33\n",
|
|||
|
"559 33\n",
|
|||
|
"8403 33\n",
|
|||
|
"1238 33\n",
|
|||
|
"1243 33\n",
|
|||
|
" ..\n",
|
|||
|
"1149 33\n",
|
|||
|
"2401 33\n",
|
|||
|
"958 33\n",
|
|||
|
"702 33\n",
|
|||
|
"14618 33\n",
|
|||
|
"Name: count, Length: 5338, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
"# Hold out 20% of the cleaned rows as the test set\n",
"train_val_df, test_df = train_test_split(\n",
"    df_cleaned, test_size=0.2, random_state=42\n",
")\n",
"\n",
"# Split the remaining 80% into training and validation parts\n",
"# (0.25 of 0.8 = 0.2 of all rows goes to validation)\n",
"train_df, val_df = train_test_split(\n",
"    train_val_df, test_size=0.25, random_state=42\n",
")\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
"def check_balance(df: pd.DataFrame, name: str, target: str = \"price\") -> None:\n",
"    \"\"\"Print how many rows carry each distinct value of the `target` column.\n",
"\n",
"    Args:\n",
"        df: Frame that contains the `target` column.\n",
"        name: Label of the split, inserted into the printed header.\n",
"        target: Column whose value counts are reported (default \"price\").\n",
"    \"\"\"\n",
"    counts = df[target].value_counts()\n",
"    print(f\"Распределение {target} в {name}:\")\n",
"    print(counts)\n",
"    print()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
"# Report the target distribution of every split before resampling\n",
"for split_df, split_label in (\n",
"    (train_df, \"обучающей выборке\"),\n",
"    (val_df, \"контрольной выборке\"),\n",
"    (test_df, \"тестовой выборке\"),\n",
"):\n",
"    check_balance(split_df, split_label)\n",
|
|||
|
"\n",
|
|||
"def oversample(df, target=\"price\", random_state=42):\n",
"    \"\"\"Randomly duplicate rows until every distinct `target` value is equally frequent.\n",
"\n",
"    Each distinct value of `target` is treated as a separate class, so a\n",
"    near-continuous column produces a very large frame.\n",
"\n",
"    Args:\n",
"        df: Frame that contains the `target` column.\n",
"        target: Column used as the class label (default \"price\").\n",
"        random_state: Seed passed to RandomOverSampler (default 42).\n",
"\n",
"    Returns:\n",
"        A new frame with the feature columns first and `target` last.\n",
"    \"\"\"\n",
"    features = df.drop(columns=target)\n",
"    labels = df[target]\n",
"\n",
"    sampler = RandomOverSampler(random_state=random_state)\n",
"    features_res, labels_res = sampler.fit_resample(features, labels)  # type: ignore\n",
"\n",
"    # Re-attach the resampled target as the last column\n",
"    return pd.concat([features_res, labels_res], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
"# Resample the training split only: validation and test data must keep their\n",
"# original distribution, otherwise the evaluation no longer reflects real data.\n",
"train_df_oversampled = oversample(train_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 125,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение price в обучающей выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"5076 80\n",
|
|||
|
"1789 80\n",
|
|||
|
"3931 80\n",
|
|||
|
"1263 80\n",
|
|||
|
"2026 80\n",
|
|||
|
" ..\n",
|
|||
|
"3678 80\n",
|
|||
|
"4592 80\n",
|
|||
|
"516 80\n",
|
|||
|
"7152 80\n",
|
|||
|
"2353 80\n",
|
|||
|
"Name: count, Length: 9496, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в контрольной выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"966 34\n",
|
|||
|
"13638 34\n",
|
|||
|
"3669 34\n",
|
|||
|
"1052 34\n",
|
|||
|
"2818 34\n",
|
|||
|
" ..\n",
|
|||
|
"4032 34\n",
|
|||
|
"544 34\n",
|
|||
|
"3362 34\n",
|
|||
|
"6559 34\n",
|
|||
|
"792 34\n",
|
|||
|
"Name: count, Length: 5383, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в тестовой выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"3742 33\n",
|
|||
|
"559 33\n",
|
|||
|
"8403 33\n",
|
|||
|
"1238 33\n",
|
|||
|
"1243 33\n",
|
|||
|
" ..\n",
|
|||
|
"1149 33\n",
|
|||
|
"2401 33\n",
|
|||
|
"958 33\n",
|
|||
|
"702 33\n",
|
|||
|
"14618 33\n",
|
|||
|
"Name: count, Length: 5338, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
"def oversample(df, target=\"price\", random_state=42):\n",
"    \"\"\"Randomly duplicate rows until every distinct `target` value is equally frequent.\n",
"\n",
"    Each distinct value of `target` is treated as a separate class, so a\n",
"    near-continuous column produces a very large frame.\n",
"\n",
"    Args:\n",
"        df: Frame that contains the `target` column.\n",
"        target: Column used as the class label (default \"price\").\n",
"        random_state: Seed passed to RandomOverSampler (default 42).\n",
"\n",
"    Returns:\n",
"        A new frame with the feature columns first and `target` last.\n",
"    \"\"\"\n",
"    features = df.drop(columns=target)\n",
"    labels = df[target]\n",
"\n",
"    sampler = RandomOverSampler(random_state=random_state)\n",
"    features_res, labels_res = sampler.fit_resample(features, labels)  # type: ignore\n",
"\n",
"    # Re-attach the resampled target as the last column\n",
"    return pd.concat([features_res, labels_res], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
"# Resample the training split only: validation and test data must keep their\n",
"# original distribution, otherwise the evaluation no longer reflects real data.\n",
"train_df_oversampled = oversample(train_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|