1241 lines
532 KiB
Plaintext
1241 lines
532 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 39,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Прогнозирование стоимости страховки\thttps://www.kaggle.com/datasets/harishkumardatalab/medical-insurance-price-prediction"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Набор данных по медицинскому страхованию содержит информацию о ряде факторов, которые могут повлиять на медицинские расходы, включая возраст, пол, ИМТ, статус курящего, количество детей и регион. Этот набор данных может быть использован для обучения модели машинного обучения, которая может прогнозировать медицинские расходы для новых клиентов."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 24,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df1 = pd.read_csv(\"..//static//csv//Medical_insurance.csv\")\n",
|
|||
|
"\n",
|
|||
|
"print(df1.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAkA0lEQVR4nO3deXRW9Z348U/CElKWoKhAFBAVQa2IVbRI+VHccatVq3V0FJcZp8WClNaWYxVq3U7RcR0XqgIdl1ac1mXUIiri2FrXQbFURMEVxLoQIgMK5P7+6OE5pigChk8IvF7n5NTc+32ST+jXhLf3uU/KiqIoAgAAYD0rb+wBAACATYP4AAAAUogPAAAghfgAAABSiA8AACCF+AAAAFKIDwAAIIX4AAAAUogPAAAghfgA2AS89tprUVZWFhMmTGjsUQDYhIkPgLUwYcKEKCsrq/e21VZbxaBBg+KBBx5In+fRRx+tN0uLFi1iu+22i5NOOinmzJnTIJ/jT3/6U4wZMyYWLlzYIB8PgE1X88YeAKApOv/886N79+5RFEUsWLAgJkyYEIccckjce++9cdhhh6XPM2zYsOjbt28sW7YsnnvuuRg3blzcd999MWPGjKiurv5SH/tPf/pT/PznP48hQ4ZE+/btG2ZgADZJ4gNgHQwePDj23HPP0vunnXZadOzYMW6//fZGiY8BAwbEMcccExERp5xySuy4444xbNiwmDhxYowaNSp9HgD4LJ52BdAA2rdvH5WVldG8ef3/prN48eIYOXJkdOnSJSoqKqJnz55x6aWXRlEUERGxZMmS6NWrV/Tq1SuWLFlSetwHH3wQnTt3jn322SdWrFix1vPsu+++ERExd+7c1a575JFHYsCAAdG6deto3759fOtb34q//vWvpfNjxoyJH//4xxER0b1799LTu1577bUvnOEfn5628u3RRx9dZe2QIUM+c+2YMWPqrbvzzjtjzz33jLZt29Zbd+mll652lg8++CB+9KMfxa677hpt2rSJdu3axeDBg+P5559fZe3rr78eRxxxRLRu3Tq22mqrGDFiREyePPkzZ3/yySfj4IMPjqqqqvjKV74SAwcOjD/+8Y9f+GcDsKly5QNgHdTU1MR7770XRVHEu+++G1dffXV89NFHceKJJ5bWFEURRxxxREydOjVOO+206NOnT0yePDl+/OMfx9tvvx2XX355VFZWxsSJE6N///5xzjnnxL//+79HRMTQoUOjpqYmJkyYEM2aNVvr+V599dWIiOjQocPnrnnooYdi8ODBsd1228WYMWNiyZIlcfXVV0f//v3jueeei2233TaOOuqoePnll+P222+Pyy+/PLbYYouIiNhyyy3XaI4DDjggTjrppIiIePrpp+Oqq6763LVbbLFFXH755aX3//mf/7ne+SeeeCKOPfbY2G233eKSSy6JqqqqeO+992LEiBFfOMecOXPirrvuiu985zvRvXv3WLBgQdxwww0xcODAmDlzZumpaYsXL45999035s+fH8OHD49OnTrFbbfdFlOnTl3lYz7yyCMxePDg2GOPPWL06NFRXl4e48ePj3333Tf+53/+J/baa681+jMC2KQUAKyx8ePHFxGxyltFRUUxYcKEemvvuuuuIiKKCy64oN7xY445pigrKyteeeWV0rFRo0YV5eXlxWOPPVZMmjSpiIjiiiuu+MJ5pk6dWkREcfPNNxd/+9vfinnz5hX33Xdfse222xZlZWXF008/XRRFUcydO7eIiGL8+PGlx/bp06fYaqutivfff7907Pnnny/Ky8uLk046qXRs7NixRUQUc+fOXeM/p08++aSIiOLMM88sHVv5dU2dOnWV9SeccELRvXv3esciohg9enTp/VGjRhURUcyfP790bOXXNXbs2NXOs3Tp0mLFihX1js2dO7eoqKgozj///NKxyy67rIiI4q677iodW7JkSdGrV696s9fV1RU9evQoDjrooKKurq609v
/+7/+K7t27FwcccMBq5wHYVHnaFcA6+I//+I+YMmVKTJkyJW655ZYYNGhQnH766fG73/2utOb++++PZs2axbBhw+o9duTIkVEURb1XxxozZkzssssucfLJJ8f3v//9GDhw4CqPW51TTz01ttxyy6iuro5DDz00Fi9eHBMnTqx3X8qnzZ8/P6ZPnx5DhgyJzTffvHS8d+/eccABB8T999+/xp/7syxdujQiIlq1arVG6z/55JOoqKhY7Zra2tooLy9fp5veKyoqorz87z/yVqxYEe+//360adMmevbsGc8991xp3R/+8IfYeuut44gjjigda9WqVfzLv/xLvY83ffr0mD17dvzTP/1TvP/++/Hee+/Fe++9F4sXL4799tsvHnvssairq1vrOQE2dp52BbAO9tprr3p/sT/++ONj9913jzPPPDMOO+ywaNmyZbz++utRXV0dbdu2rffYnXbaKSL+fm/BSi1btoybb745+vbtG61atYrx48dHWVnZGs9z3nnnxYABA6JZs2axxRZbxE477bTK/SeftvJz9+zZc5VzO+20U0yePDkWL14crVu3XuMZPu29996LiIiqqqo1Wr9w4cJo06bNatf069cvrrnmmhg+fHicffbZUVVVFR9++OEaffy6urq48sor49prr425c+fWu4/m009Ne/3112P77bdf5c9+hx12qPf+7NmzIyLi5JNP/tzPWVNTE5ttttkazQewqRAfAA2gvLw8Bg0aFFdeeWXMnj07dtlll7X+GJMnT46Iv181mD17dnTv3n2NH7vrrrvG/vvvv9afc31ZeUP6tttuu0br33nnnejWrdtq13z3u9+N5557Lq6++uoYN27cWs1z0UUXxbnnnhunnnpq/OIXv4jNN988ysvL46yzzlqnKxQrHzN27Njo06fPZ675opgC2BSJD4AGsnz58oiI+OijjyIiolu3bvHQQw9FbW1tvasfL730Uun8Si+88EKcf/75ccopp8T06dPj9NNPjxkzZqzxlYO1tfJzz5o1a5VzL730UmyxxRalqx5rcwVmpWeeeSYi4nOf9vVpy5Yti1deeSUOPvjg1a4rLy+PSy+9NGbMmBFz586Na6+9NhYsWFDvJv/Pc+edd8agQYPipptuqnd84cKFpZvoI/7+5zJz5swoiqLe1/3KK6/Ue9z2228fERHt2rXboKIPYEPnng+ABrBs2bJ48MEHo2XLlqWnVR1yyCGxYsWKuOaaa+qtvfzyy6OsrCwGDx5ceuyQIUOiuro6rrzyypgwYUIsWLBgjV7FaV117tw5+vTpExMnTqz3m8tffPHFePDBB+OQQw4pHVsZIWvzG87vvPPO6NmzZ/Tq1esL1959992xZMmS0ssDr87VV18djzzySNx6662x//77R//+/ddonmbNmpVe3nilSZMmxdtvv13v2EEHHRRvv/123HPPPaVjS5cujV/96lf11u2xxx6x/fbbx6WXXlqKzU/729/+tkZzAWxqXPkAWAcPPPBA6QrGu+++G7fddlvMnj07fvrTn0a7du0iIuLwww+PQYMGxTnnnBOvvfZa7LbbbvHggw/G3XffHWeddVbpv55fcMEFMX369Hj44Yejbdu20bt37zjvvPPiZz/7WRxzzDH1QqAhjR07NgYPHhz9+vWL0047rfRSu1VVVfV+v8Yee+wRERHnnHNOfPe7340WLVrE4Ycf/pn3g8yZMyd++ctfxlNPPRVHHXVU3HLLLaVzTz/9dERETJkyJbp27RqdOnWK0aNHx7XXXhv77LNPHHjggaud9y9/+UucffbZMWbMmOjbt+9afa2HHXZY6crSPvvsEzNmzIhbb701tttuu3rrzjjjjLjmmmvi+OOPj+HDh0fnzp3j1ltvLd04v/JqSHl5edx4440xePDg2GWXXeKUU06JrbfeOt5+++2YOnVqtGvXLu699961mhFgk9DIr7YF0KR81kvttmrVqujTp09x3XXX1XvZ1aIoitra2mLEiB
FFdXV10aJFi6JHjx7F2LFjS+ueffbZonnz5sUPfvCDeo9bvnx50bdv36K6urr48MMPP3eelS+1O2nSpNXO/VkvtVs
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC3PUlEQVR4nOzdeXgUVdo28Dsh6SQQOiFACGtYXJIYQISAUYPLABHigssYlwFEBtCXMCM4LvkkiATNjI4vOIILyIg68II6qEjCNqDCQMTIImICLiwRMYQ1IUhoQvf3R6Z7SFLdfbpOpauq+/5dl9cl3U91Vbqrq89T55znhDgcDgeIiIiIiIjI70L1PgAiIiIiIqJgxYSMiIiIiIhIJ0zIiIiIiIiIdMKEjIiIiIiISCdMyIiIiIiIiHTChIyIiIiIiEgnTMiIiIiIiIh0woSMiIiIiIhIJ0zIiIiIiIiIdMKEjIiIDOvAgQMICQnBokWL9D6UBlavXo0rr7wSkZGRCAkJwalTp/Q+JNO64YYbkJqaqvdhEBHphgkZEZEOvvnmG9x9991ITExEZGQkOnfujKFDh+KVV15ptn0uWbIEc+bMafL44cOHMWPGDOzcubPZ9t3YZ599hpCQENd/4eHh6NmzJ0aPHo19+/Zpso8tW7ZgxowZmidLx48fxz333IOoqCjMmzcP7777Llq1auV1u1dffRUhISEYNGiQpsdjVNXV1Xj22WfRt29fREdHIyoqCqmpqXjyySdx+PBhvQ+PiMgwwvQ+ACKiYLNlyxbceOON6NatG8aPH4+EhAT89NNP+OKLL/Dyyy9j8uTJzbLfJUuWYPfu3Xj00UcbPH748GE8++yz6N69O6688spm2bc7f/jDH5CWlobz589j+/btmD9/PgoLC/HNN9+gU6dOUq+9ZcsWPPvss3jwwQcRGxurzQEDKCkpwenTp5Gfn48hQ4YIb7d48WJ0794dX375JX744Qdccsklmh2T0ezbtw9DhgxBeXk5fvvb32LChAmwWCzYtWsXFi5ciA8//BDfffed3odJRGQITMiIiPzsueeeQ0xMDEpKSpokCpWVlfocVDM4c+aM156jjIwM3H333QCAsWPH4rLLLsMf/vAHvP3228jNzfXHYfrM+Rn5kuTt378fW7ZswfLlyzFx4kQsXrwYzzzzTDMdob7q6upw55134siRI/jss89w3XXXNXj+ueeew1/+8he/H5PdbofFYvHrfomIRHDIIhGRn/3444+44oorFBv08fHxTR77xz/+gYEDB6Jly5Zo06YNBg8ejLVr17qe//jjj5GVlYVOnTohIiICvXr1Qn5+Pi5cuOCKueGGG1BYWIiDBw+6hgl2794dn332GdLS0gDUJ0TO5y6es7V161bcfPPNiImJQcuWLXH99ddj8+bNDY5xxowZCAkJQWlpKe6//360adOmSUNcxE033QSgPoHxZMOGDcjIyECrVq0QGxuL22+/HWVlZQ2O5/HHHwcA9OjRw/V3HThwwOPrvv/+++jfvz+ioqLQrl07/O53v8PPP//sev6GG27AmDFjAABpaWkICQnBgw8+6PXvWrx4Mdq0aYOsrCzcfffdWLx4sWLc8ePHMWrUKFitVsTGxmLMmDH4+uuvFefR7dmzB3fffTfi4uIQGRmJAQMGYMWKFR6P4/z584iLi8PYsWObPFddXY3IyEj86U9/cj32yiuv4IorrnCdewMGDMCSJUs87uOf//wnvv76azz99NOK54DVasVzzz3X5PHS0lLceOONaNmyJTp37owXXnihwfM2mw3Tp09H//79ERMTg1atWiEjIwOffvppgzjnvMO//vWvmDNnDnr16oWIiAiUlpYCqB8uO2DAAERGRqJXr1544403XOdvY//4xz9c50NcXBzuvfde/PTTTx7/fiIiX7GHjIjIzxITE1FcXIzdu3d7LWbw7LPPYsaMGbjmmmswc+ZMWCwWbN26FRs2bMCwYc
MAAIsWLUJ0dDSmTp2K6OhobNiwAdOnT0d1dTVefPFFAMDTTz+NqqoqHDp0CLNnzwYAREdHIzk5GTNnzsT06dMxYcIEZGRkAACuueYaAPWJz/Dhw9G/f38888wzCA0NxVtvvYWbbroJmzZtwsCBAxsc729/+1tceumleP755+FwOHx+b3788UcAQNu2bd3G/Otf/8Lw4cPRs2dPzJgxA2fPnsUrr7yCa6+9Ftu3b0f37t1x55134rvvvsP//d//Yfbs2WjXrh0AoH379m5fd9GiRRg7dizS0tJQUFCAI0eO4OWXX8bmzZuxY8cOxMbG4umnn8bll1+O+fPnY+bMmejRowd69erl9e9avHgx7rzzTlgsFtx333147bXXUFJS4kqGAcBut+PWW2/Fl19+iUceeQRJSUn4+OOPXQngxb799ltce+216Ny5M5566im0atUK7733HkaOHIl//vOfuOOOOxSPIzw8HHfccQeWL1+ON954o0GP0UcffYRz587h3nvvBQAsWLAAf/jDH3D33Xfjj3/8I2pra7Fr1y5s3boV999/v9u/1ZkUjho1yuv74nTy5EncfPPNuPPOO3HPPffggw8+wJNPPonevXtj+PDhAOoTxjfffBP33Xcfxo8fj9OnT2PhwoXIzMzEl19+2WS47VtvvYXa2lpMmDABERERiIuLw44dO3DzzTejY8eOePbZZ3HhwgXMnDlT8bx47rnnkJeXh3vuuQe///3vcfToUbzyyisYPHiw63wgItKEg4iI/Grt2rWOFi1aOFq0aOFIT093PPHEE441a9Y4bDZbg7jvv//eERoa6rjjjjscFy5caPCc3W53/f+vv/7aZB8TJ050tGzZ0lFbW+t6LCsry5GYmNgktqSkxAHA8dZbbzXZx6WXXurIzMxssr8ePXo4hg4d6nrsmWeecQBw3HfffULvwaeffuoA4Pj73//uOHr0qOPw4cOOwsJCR/fu3R0hISGOkpISh8PhcOzfv7/JsV155ZWO+Ph4x/Hjx12Pff31147Q0FDH6NGjXY+9+OKLDgCO/fv3ez0em83miI+Pd6SmpjrOnj3renzlypUOAI7p06e7HnvrrbccAFzH6M1XX33lAOBYt26dw+Gof1+7dOni+OMf/9gg7p///KcDgGPOnDmuxy5cuOC46aabmrwHv/nNbxy9e/du8Pna7XbHNddc47j00ks9Hs+aNWscAByffPJJg8dHjBjh6Nmzp+vft99+u+OKK64Q+hsv1q9fP0dMTIxw/PXXX+8A4HjnnXdcj507d86RkJDguOuuu1yP1dXVOc6dO9dg25MnTzo6dOjgeOihh1yPOc8Zq9XqqKysbBB/6623Olq2bOn4+eefXY99//33jrCwMMfFTaIDBw44WrRo4XjuuecabP/NN984wsLCmjxORCSDQxaJiPxs6NChKC4uxm233Yavv/4aL7zwAjIzM9G5c+cGQ84++ugj2O12TJ8+HaGhDS/XFw+vioqKcv3/6dOncezYMWRkZODXX3/Fnj17VB/nzp078f333+P+++/H8ePHcezYMRw7dgxnzpzBb37zG2zcuBF2u73BNg8//LBP+3jooYfQvn17dOrUCVlZWThz5gzefvttDBgwQDH+l19+wc6dO/Hggw8iLi7O9XifPn0wdOhQFBUV+f6HAvjqq69QWVmJ//mf/0FkZKTr8aysLCQlJaGwsFDV6wL1vWMdOnTAjTfeCKD+s8vOzsbSpUsbDCtdvXo1wsPDMX78eNdjoaGhmDRpUoPXO3HiBDZs2IB77rnH9XkfO3YMx48fR2ZmJr7//vsGwywbu+mmm9CuXTssW7bM9djJkyexbt06ZGdnux6LjY3FoUOHUFJS4tPfW11djdatW/u0TXR0NH73u9+5/m2xWDBw4MAGFTdbtGjh6tGz2+04ceIE6urqMGDAAGzfvr3Ja951110Ner4uXLiAf/3rXxg5cmSDgjGXXHKJqxfOafny5bDb7bjnnntc7+
+xY8eQkJCASy+9tMkwSSIiGRyySESkg7S0NCxfvhw2mw1ff/01PvzwQ8yePRt33303du7ciZSUFPz4448IDQ1FSkq
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Визуализация данных - ящик с усами. Выборка относительно сбалансированна, пускай и смещена в меньшую сторону. Пустых значений нет.\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df1[\"age\"])\n",
|
|||
|
"plt.title(\"Box Plot для age\")\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"#Визуализируем отношение стоимости и возраста\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df1[\"age\"], df1[\"charges\"])\n",
|
|||
|
"plt.xlabel('Age')\n",
|
|||
|
"plt.ylabel(\"Charge\")\n",
|
|||
|
"plt.title('Scatter Plot of Age vs Charge')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
"Empty DataFrame\n",
|
|||
|
"Columns: [age, sex, bmi, children, smoker, region, charges]\n",
|
|||
|
"Index: []\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"C:\\Users\\Admin\\AppData\\Local\\Temp\\ipykernel_8452\\3242078583.py:14: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
|
|||
|
" print(df[outliers])\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC3pUlEQVR4nOzdeXhU1fkH8G/2BMIEIoYAYQmghBgiiIhRExGBCHFBUUEtW5HFElvBqs1PQCBiWq0FKyiiVlygoC0qhSBEEEGJGhBETEBZI0gIewgShmHm9wedKUnunTlzz83cuTPfz/PwtM68Z+7NrOe955z3hDgcDgeIiIiIiIjI50KNPgEiIiIiIqJgxYSMiIiIiIjIIEzIiIiIiIiIDMKEjIiIiIiIyCBMyIiIiIiIiAzChIyIiIiIiMggTMiIiIiIiIgMwoSMiIiIiIjIIEzIiIiIiIiIDMKEjIiI/Na+ffsQEhKCBQsWGH0qtXzyySfo1q0boqOjERISgpMnTxp9SqbVu3dvpKWlGX0aRESGYUJGRGSA77//Hvfeey/atWuH6OhotG7dGv369cPLL7/cYMdctGgRZs+eXe/2X375BdOmTcPWrVsb7Nh1rVu3DiEhIa5/ERER6NChA4YPH449e/bocoyNGzdi2rRpuidLx44dw/3334+YmBjMnTsX7777Lho3buyx3SuvvIKQkBD06tVL1/PxV1VVVZg+fTquvvpqxMbGIiYmBmlpaXjqqafwyy+/GH16RER+I9zoEyAiCjYbN27ELbfcgrZt22LMmDFITEzEzz//jK+++govvfQSHn300QY57qJFi7B9+3Y89thjtW7/5ZdfMH36dLRv3x7dunVrkGOr+f3vf4+ePXvi/Pnz+PbbbzF//nysWLEC33//PVq1aiX12Bs3bsT06dMxcuRING3aVJ8TBlBSUoLTp08jPz8fffv2FW63cOFCtG/fHt988w127dqFTp066XZO/mbPnj3o27cvysvLcd9992Hs2LGIjIzEtm3b8Oabb+LDDz/Ejz/+aPRpEhH5BSZkREQ+NnPmTMTFxaGkpKReolBZWWnMSTWAM2fOeBw5yszMxL333gsAGDVqFK688kr8/ve/x9tvv428vDxfnKbXnK+RN0ne3r17sXHjRixduhTjxo3DwoUL8cwzzzTQGRrLZrPhnnvuweHDh7Fu3TrcdNNNte6fOXMm/vKXv/j8nOx2OyIjI316XCIiEZyySETkY7t378ZVV12l2KFPSEiod9t7772H6667Do0aNUKzZs2QlZWF1atXu+7/+OOPkZOTg1atWiEqKgodO3ZEfn4+Lly44Irp3bs3VqxYgf3797umCbZv3x7r1q1Dz549AVxMiJz3Xbpm6+uvv8Ztt92GuLg4NGrUCDfffDO+/PLLWuc4bdo0hISEoLS0FA8++CCaNWtWryMuok+fPgAuJjDurF27FpmZmWjcuDGaNm2Ku+66C2VlZbXO54knngAAJCcnu/6uffv2uX3cDz74AD169EBMTAyaN2+O3/zmNzh48KDr/t69e2PEiBEAgJ49eyIkJAQjR470+HctXLgQzZo1Q05ODu69914sXLhQMe7YsWMYNmwYLBYLmjZtihEjRuC7775TXEe3Y8cO3HvvvYiPj0d0dDSuvfZaLFu2zO15nD9/HvHx8Rg1alS9+6qqqhAdHY0//vGPrttefvllXHXVVa733rXXXotFixa5Pca///1vfPfdd3j66acV3wMWiwUzZ86sd3tpaSluueUWNGrUCK1bt8bzzz9f636r1YqpU6eiR48eiIuLQ+PGjZGZmYnPPvusVpxz3eFf//pXzJ49Gx07dkRUVBRKS0sBXJwue+211yI6OhodO3bEa6+95nr/1vXee++53g/x8fEYOnQofv75Z7d/PxGRtzhCRkTkY+3atUNxcTG2b9/usZjB9OnTMW3aNNxwww2YMWMGIiMj8fXXX2Pt2rXo378/AGDBgg
WIjY3FpEmTEBsbi7Vr12Lq1KmoqqrCCy+8AAB4+umncerUKRw4cACzZs0CAMTGxqJLly6YMWMGpk6dirFjxyIzMxMAcMMNNwC4mPgMGDAAPXr0wDPPPIPQ0FC89dZb6NOnDzZs2IDrrruu1vned999uOKKK/Dcc8/B4XB4/dzs3r0bAHDZZZepxnz66acYMGAAOnTogGnTpuHs2bN4+eWXceONN+Lbb79F+/btcc899+DHH3/EP//5T8yaNQvNmzcHAFx++eWqj7tgwQKMGjUKPXv2REFBAQ4fPoyXXnoJX375JbZs2YKmTZvi6aefRufOnTF//nzMmDEDycnJ6Nixo8e/a+HChbjnnnsQGRmJBx54AK+++ipKSkpcyTAA2O123HHHHfjmm2/wyCOPICUlBR9//LErAbzUDz/8gBtvvBGtW7fGn/70JzRu3Bjvv/8+Bg0ahH//+9+4++67Fc8jIiICd999N5YuXYrXXnut1ojRRx99hHPnzmHo0KEAgNdffx2///3vce+99+IPf/gDampqsG3bNnz99dd48MEHVf9WZ1I4bNgwj8+L04kTJ3Dbbbfhnnvuwf33349//etfeOqpp9C1a1cMGDAAwMWE8Y033sADDzyAMWPG4PTp03jzzTeRnZ2Nb775pt5027feegs1NTUYO3YsoqKiEB8fjy1btuC2225Dy5YtMX36dFy4cAEzZsxQfF/MnDkTU6ZMwf3334+HH34YR44cwcsvv4ysrCzX+4GISBcOIiLyqdWrVzvCwsIcYWFhjoyMDMeTTz7pWLVqlcNqtdaK++mnnxyhoaGOu+++23HhwoVa99ntdtf///XXX+sdY9y4cY5GjRo5ampqXLfl5OQ42rVrVy+2pKTEAcDx1ltv1TvGFVdc4cjOzq53vOTkZEe/fv1ctz3zzDMOAI4HHnhA6Dn47LPPHAAc//jHPxxHjhxx/PLLL44VK1Y42rdv7wgJCXGUlJQ4HA6HY+/evfXOrVu3bo6EhATHsWPHXLd99913jtDQUMfw4cNdt73wwgsOAI69e/d6PB+r1epISEhwpKWlOc6ePeu6ffny5Q4AjqlTp7pue+uttxwAXOfoyaZNmxwAHEVFRQ6H4+LzmpSU5PjDH/5QK+7f//63A4Bj9uzZrtsuXLjg6NOnT73n4NZbb3V07dq11utrt9sdN9xwg+OKK65wez6rVq1yAHD85z//qXX7wIEDHR06dHD991133eW46qqrhP7GS3Xv3t0RFxcnHH/zzTc7ADjeeecd123nzp1zJCYmOgYPHuy6zWazOc6dO1er7YkTJxwtWrRw/Pa3v3Xd5nzPWCwWR2VlZa34O+64w9GoUSPHwYMHXbf99NNPjvDwcMelXaJ9+/Y5wsLCHDNnzqzV/vvvv3eEh4fXu52ISAanLBIR+Vi/fv1QXFyMO++8E9999x2ef/55ZGdno3Xr1rWmnH300Uew2+2YOnUqQkNrf11fOr0qJibG9f9Pnz6No0ePIjMzE7/++it27Nih+Ty3bt2Kn376CQ8++CCOHTuGo0eP4ujRozhz5gxuvfVWrF+/Hna7vVab8ePHe3WM3/72t7j88svRqlUr5OTk4MyZM3j77bdx7bXXKsYfOnQIW7duxciRIxEfH++6PT09Hf369UNhYaH3fyiATZs2obKyEr/73e8QHR3tuj0nJwcpKSlYsWKFpscFLo6OtWjRArfccguAi6/dkCFDsHjx4lrTSj/55BNERERgzJgxrttCQ0MxYcKEWo93/PhxrF27Fvfff7/r9T569CiOHTuG7Oxs/PTTT7WmWdbVp08fNG/eHEuWLHHdduLECRQVFWHIkCGu25o2bYoDBw6gpKTEq7+3qqoKTZo08apNbGwsfvOb37j+OzIyEtddd12tipthYWGuET273Y7jx4/DZrPh2muvxbffflvvMQcPHlxr5OvChQv49NNPMWjQoFoFYzp16uQahXNaunQp7HY77r//ftfze/ToUSQmJu
KKK66oN02SiEgGpywSERmgZ8+eWLp0KaxWK7777jt8+OGHmDVrFu69915s3boVqamp2L17N0JDQ5Gamur2sX744Qd
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Есть шумы, убираем\n",
|
|||
|
"\n",
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df1[\"charges\"].quantile(0.25)\n",
|
|||
|
"Q3 = df1[\"charges\"].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df1[\"charges\"] < (Q1 - threshold)) | (df1[\"charges\"] > (Q3 + threshold))\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df1[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае строки с выбросами удаляются: значение charges у них обнуляется, после чего такие строки отфильтровываются\n",
|
|||
|
"median_charge = df1[\"charges\"].median()\n",
|
|||
|
"df1.loc[outliers, \"charges\"] = 0\n",
|
|||
|
"df1 = df1[df1.charges != 0]\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df1[\"age\"], df1[\"charges\"])\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.ylabel(\"Charge\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Age vs Charge\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 1485\n",
|
|||
|
"Размер контрольной выборки: 495\n",
|
|||
|
"Размер тестовой выборки: 496\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"#Разбиение набора данных на обучающую, контрольную и тестовую выборки\n",
|
|||
|
"\n",
|
|||
|
"train_df1, temp_df1 = train_test_split(df1, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение остатка на контрольную и тестовую выборки\n",
|
|||
|
"val_df1, test_df1 = train_test_split(temp_df1, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df1))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df1))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df1))\n",
|
|||
|
"\n",
|
|||
|
"# Сохранение выборок в файлы\n",
|
|||
|
"train_df1.to_csv(\"..//static//csv//train_data.csv\", index=False)\n",
|
|||
|
"val_df1.to_csv(\"..//static//csv//val_data.csv\", index=False)\n",
|
|||
|
"test_df1.to_csv(\"..//static//csv//test_data.csv\", index=False)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Review_type в обучающей выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 765\n",
|
|||
|
"male 720\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 48.48%\n",
|
|||
|
"Процент женщин: 51.52%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Review_type в контрольной выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"male 257\n",
|
|||
|
"female 238\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 51.92%\n",
|
|||
|
"Процент женщин: 48.08%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Review_type в тестовой выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 259\n",
|
|||
|
"male 237\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 47.78%\n",
|
|||
|
"Процент женщин: 52.22%\n",
|
|||
|
"\n",
|
|||
|
"Аугментация данных не требуется.\n",
|
|||
|
"Аугментация данных не требуется.\n",
|
|||
|
"Аугментация данных не требуется.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"#Проанализируем сбалансированность выборок\n",
|
|||
|
"train_df1 = pd.read_csv(\"..//static//csv//train_data.csv\")\n",
|
|||
|
"val_df1 = pd.read_csv(\"..//static//csv//val_data.csv\")\n",
|
|||
|
"test_df1 = pd.read_csv(\"..//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Оценка сбалансированности\n",
|
|||
|
"def check_balance(df1, name):\n",
|
|||
|
" counts = df1['sex'].value_counts()\n",
|
|||
|
" print(f\"Распределение sex в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print(f\"Процент мужчин: {counts['male'] / len(df1) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент женщин: {counts['female'] / len(df1) * 100:.2f}%\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Определение необходимости аугментации данных\n",
|
|||
|
"def need_augmentation(df1):\n",
|
|||
|
" counts = df1['sex'].value_counts()\n",
|
|||
|
" ratio = counts['male'] / counts['female']\n",
|
|||
|
" if ratio > 1.5 or ratio < 0.67:\n",
|
|||
|
" print(\"Необходима аугментация данных для балансировки классов.\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(\"Аугментация данных не требуется.\")\n",
|
|||
|
" \n",
|
|||
|
"check_balance(train_df1, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df1, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df1, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"need_augmentation(train_df1)\n",
|
|||
|
"need_augmentation(val_df1)\n",
|
|||
|
"need_augmentation(test_df1)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение sex в обучающей выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"male 765\n",
|
|||
|
"female 765\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в контрольной выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"male 257\n",
|
|||
|
"female 257\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в тестовой выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 259\n",
|
|||
|
"male 259\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение sex в обучающей выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 720\n",
|
|||
|
"male 720\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в контрольной выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 238\n",
|
|||
|
"male 238\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в тестовой выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 237\n",
|
|||
|
"male 237\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"train_df1 = pd.read_csv(\"..//static//csv//train_data.csv\")\n",
|
|||
|
"val_df1 = pd.read_csv(\"..//static//csv//val_data.csv\")\n",
|
|||
|
"test_df1 = pd.read_csv(\"..//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование категориальных признаков в числовые\n",
|
|||
|
"def encode(df1):\n",
|
|||
|
" label_encoders = {}\n",
|
|||
|
" for column in df1.select_dtypes(include=['object']).columns:\n",
|
|||
|
" if column != 'sex': # Пропускаем целевую переменную\n",
|
|||
|
" le = LabelEncoder()\n",
|
|||
|
" df1[column] = le.fit_transform(df1[column])\n",
|
|||
|
" label_encoders[column] = le\n",
|
|||
|
" return label_encoders\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование целевой переменной в числовые значения\n",
|
|||
|
"def encode_target(df1):\n",
|
|||
|
" le = LabelEncoder()\n",
|
|||
|
" df1['sex'] = le.fit_transform(df1['sex'])\n",
|
|||
|
" return le\n",
|
|||
|
"\n",
|
|||
|
"# Применение кодирования\n",
|
|||
|
"label_encoders = encode(train_df1)\n",
|
|||
|
"encode(val_df1)\n",
|
|||
|
"encode(test_df1)\n",
|
|||
|
"\n",
|
|||
|
"# Кодирование целевой переменной\n",
|
|||
|
"le_target = encode_target(train_df1)\n",
|
|||
|
"encode_target(val_df1)\n",
|
|||
|
"encode_target(test_df1)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка типов данных\n",
|
|||
|
"def check_data_types(df1):\n",
|
|||
|
" for column in df1.columns:\n",
|
|||
|
" if df1[column].dtype == 'object':\n",
|
|||
|
" print(f\"Столбец '{column}' содержит строковые данные.\")\n",
|
|||
|
"\n",
|
|||
|
"check_data_types(train_df1)\n",
|
|||
|
"check_data_types(val_df1)\n",
|
|||
|
"check_data_types(test_df1)\n",
|
|||
|
"\n",
|
|||
|
"# Функция для выполнения oversampling\n",
|
|||
|
"def oversample(df1):\n",
|
|||
|
" if 'sex' not in df1.columns:\n",
|
|||
|
" print(\"Столбец 'sex' отсутствует.\")\n",
|
|||
|
" return df1\n",
|
|||
|
" \n",
|
|||
|
" X = df1.drop('sex', axis=1)\n",
|
|||
|
" y = df1['sex']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df1 = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df1\n",
|
|||
|
"\n",
|
|||
|
"# Функция для выполнения undersampling\n",
|
|||
|
"def undersample(df1):\n",
|
|||
|
" if 'sex' not in df1.columns:\n",
|
|||
|
" print(\"Столбец 'sex' отсутствует.\")\n",
|
|||
|
" return df1\n",
|
|||
|
" \n",
|
|||
|
" X = df1.drop('sex', axis=1)\n",
|
|||
|
" y = df1['sex']\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df1 = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df1\n",
|
|||
|
"\n",
|
|||
|
"# Применение oversampling и undersampling к каждой выборке\n",
|
|||
|
"train_df1_oversampled = oversample(train_df1)\n",
|
|||
|
"val_df1_oversampled = oversample(val_df1)\n",
|
|||
|
"test_df1_oversampled = oversample(test_df1)\n",
|
|||
|
"\n",
|
|||
|
"train_df1_undersampled = undersample(train_df1)\n",
|
|||
|
"val_df1_undersampled = undersample(val_df1)\n",
|
|||
|
"test_df1_undersampled = undersample(test_df1)\n",
|
|||
|
"\n",
|
|||
|
"# Обратное преобразование целевой переменной в строковые метки\n",
|
|||
|
"def decode_target(df1, le_target):\n",
|
|||
|
" df1['sex'] = le_target.inverse_transform(df1['sex'])\n",
|
|||
|
"\n",
|
|||
|
"decode_target(train_df1_oversampled, le_target)\n",
|
|||
|
"decode_target(val_df1_oversampled, le_target)\n",
|
|||
|
"decode_target(test_df1_oversampled, le_target)\n",
|
|||
|
"\n",
|
|||
|
"decode_target(train_df1_undersampled, le_target)\n",
|
|||
|
"decode_target(val_df1_undersampled, le_target)\n",
|
|||
|
"decode_target(test_df1_undersampled, le_target)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка результатов\n",
|
|||
|
"def check_balance(df1, name):\n",
|
|||
|
" if 'sex' not in df1.columns:\n",
|
|||
|
" print(f\"Столбец 'sex' отсутствует в {name}.\")\n",
|
|||
|
" return\n",
|
|||
|
" \n",
|
|||
|
" counts = df1['sex'].value_counts()\n",
|
|||
|
" print(f\"Распределение sex в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" \n",
|
|||
|
" if 'male' in counts and 'female' in counts:\n",
|
|||
|
" print(f\"Процент мужчин: {counts['male'] / len(df1) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент женщин: {counts['female'] / len(df1) * 100:.2f}%\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(\"Отсутствуют один или оба класса (male/female).\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности после oversampling\n",
|
|||
|
"print(\"Оверсэмплинг:\")\n",
|
|||
|
"check_balance(train_df1_oversampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df1_oversampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df1_oversampled, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности после undersampling\n",
|
|||
|
"print(\"Андерсэмплинг:\")\n",
|
|||
|
"check_balance(train_df1_undersampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df1_undersampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df1_undersampled, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Объекты вокруг Земли\thttps://www.kaggle.com/datasets/sameepvani/nasa-nearest-earth-objects"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"В космическом пространстве существует бесконечное количество объектов. Некоторые из них находятся ближе, чем мы думаем. Хотя нам может казаться, что расстояние в 70 000 км не может причинить нам вред, в астрономическом масштабе это очень маленькое расстояние, которое может нарушить многие природные явления. Таким образом, эти объекты/астероиды могут причинить вред. Поэтому разумно знать, что нас окружает и что может причинить нам вред. Таким образом, этот набор данных содержит список сертифицированных НАСА астероидов, которые классифицируются как ближайшие к Земле объекты."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['id', 'name', 'est_diameter_min', 'est_diameter_max',\n",
|
|||
|
" 'relative_velocity', 'miss_distance', 'orbiting_body', 'sentry_object',\n",
|
|||
|
" 'absolute_magnitude', 'hazardous'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df2 = pd.read_csv(\"..//static//csv//neo.csv\")\n",
|
|||
|
"\n",
|
|||
|
"print(df2.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 45,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"id 0\n",
|
|||
|
"name 0\n",
|
|||
|
"est_diameter_min 0\n",
|
|||
|
"est_diameter_max 0\n",
|
|||
|
"relative_velocity 0\n",
|
|||
|
"miss_distance 0\n",
|
|||
|
"orbiting_body 0\n",
|
|||
|
"sentry_object 0\n",
|
|||
|
"absolute_magnitude 0\n",
|
|||
|
"hazardous 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"id False\n",
|
|||
|
"name False\n",
|
|||
|
"est_diameter_min False\n",
|
|||
|
"est_diameter_max False\n",
|
|||
|
"relative_velocity False\n",
|
|||
|
"miss_distance False\n",
|
|||
|
"orbiting_body False\n",
|
|||
|
"sentry_object False\n",
|
|||
|
"absolute_magnitude False\n",
|
|||
|
"hazardous False\n",
|
|||
|
"dtype: bool\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"#Получение сведений о пропущенных данных\n",
|
|||
|
"\n",
|
|||
|
"# Количество пустых значений признаков\n",
|
|||
|
"print(df2.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Есть ли пустые значения признаков\n",
|
|||
|
"print(df2.isnull().any())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Процент пустых значений признаков\n",
|
|||
|
"for i in df2.columns:\n",
|
|||
|
" null_rate = df2[i].isnull().sum() / len(df2) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f\"{i} процент пустых значений: {null_rate:.2f}%\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Итог: пропущенных значений нет\n",
|
|||
|
"\n",
|
|||
|
"Инфографика на сайте и в Data Wrangler показывает, что в столбцах orbiting_body и sentry_object у всех записей одно и то же значение. Значит, эти столбцы можно исключить из набора данных."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"количество колонок: 8\n",
|
|||
|
"колонки: id, name, est_diameter_min, est_diameter_max, relative_velocity, miss_distance, absolute_magnitude, hazardous\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df2 = df2.drop(columns=['orbiting_body'])\n",
|
|||
|
"df2 = df2.drop(columns=['sentry_object'])\n",
|
|||
|
"print('количество колонок: ' + str(df2.columns.size)) \n",
|
|||
|
"print('колонки: ' + ', '.join(df2.columns))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 50,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZ0AAAK9CAYAAAD/m7EJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABXv0lEQVR4nO3dd3xT9f4/8FfSJulMS+mGDpZgmdciWFYr0FaWIHC5gkpBxMEQ6UWEL7JcVVBEEAGvl6HAVQFxsmpZggW0gFCWFFlCBwW62zRNPr8/aPMjbQppmp604fV8PPqAfM7nnLyTpp9XzjmfnMiEEAJEREQSkNu6ACIiun8wdIiISDIMHSIikgxDh4iIJMPQISIiyTB0iIhIMgwdIiKSDEOHiIgkw9AhIiLJMHTIboSGhmLMmDGG23v27IFMJsOePXtsVtP9ICoqClFRUbYuo0bWrFkDmUyG33//3dalWEVDeq3XSeisWLECsbGx8PPzg0KhgL+/PyIjI/H5559Dr9fXxV1SA7NhwwYsXrzY1mXYXFFREebNm9cgBgsia3Csi42uXbsWAQEBmD17NtRqNXJycnDw4EGMGTMG27Ztw//+97+6uFtqQDZs2IDU1FS88sordXYfvXr1QnFxMZRKZZ3dR20VFRVh/vz5ANDg9hYq7Ny509YlUANSJ6Gzb98+KBQKo7aXX34ZjRs3xscff4yEhASEhobWxV0TGcjlcjg5Odm6DJsoLCyEq6urJPdVn0PdXgghUFJSAmdnZ1uXUmt1cnitcuBUqAgaufz/3+13332HAQMGIDAwECqVCi1atMCbb74JnU5ntG5UVBRkMpnhx9vbGwMGDEBqaqpRP5lMhnnz5hm1LVy4EDKZrMo7yZKSEsybNw8PPPAAnJycEBAQgKFDh+L8+fMAgIsXL0Imk2HNmjVG602cOBEymczo/EHFMWKlUonr168b9U9OTjbUXfkY8saNGxEeHg5nZ2d4e3vj6aefxtWrV6s8d2fOnMGIESPg4+MDZ2dntG7dGrNmzQIAzJs3z+i5MfVTcfgmKioK7dq1q7J9cx06dAiPPfYYPDw84OLigsjISBw4cMCoT35+Pl555RWEhoZCpVLB19cX0dHROHLkiKGGn376CZcuXTLUV5M3IUIIvPXWW2jatClcXFzw6KOP4uTJk1X6mTrO/csvv+Cf//wngoODoVKpEBQUhKlTp6K4uNho3TFjxsDNzQ2XL1/GwIED4ebmhiZNmmDZsmUAgBMnTqB3795wdXVFSEgINmzYUOX+c3Jy8MorryAoKAgqlQotW7bEe++9ZzjEfPHiRfj4+AAA5s+fb3gu7nz9njlzBsOHD4eXlxecnJzQuXNnfP/990b3U/Ha27t3LyZMmABfX180bdrUrOey4jX+/vvvY9myZWjevDlcXFwQExODK1euQAiBN998E02bNoWzszMGDx6MmzdvGm2j8jmdiuf966+/xttvv42mTZvCyckJffr0QVpamll1HT16FP369YNarYabmxv69OmDgwcPVnnMd/up/HdrikajQXx8PHx8fODq6oonnniiyt+vOWPU3eq587lZvXo1evfuDV9fX6hUKoSFhWH58uVV6goNDcXAgQOxY8cOdO7cGc7Ozli5ciUA4O+//8aQIUPg6uoKX19fTJ06FRqNxuTjM2d8qe6c3JgxY6r8XX755ZcIDw+Hu7s71Go12rdvj48++uhuT3EVdbKnUyEnJwdlZWXIz89HSkoK3n//fTz55JMIDg429FmzZg3c3NwQHx8PNzc37Nq1C3PmzEFeXh4WLlxotL02bdpg1qxZEELg/PnzWLRoEfr374/Lly/ftYaEhIQq7TqdDgMHDkRSUhKefPJJTJkyBfn5+UhMTERqaipatGhhcntpaWn4z3/+U+39OTg4YN26dZg6daqhbfXq1XByckJJSYlR3zVr1mDs2LF4+OGHkZCQgMzMTHz00Uc4cOAAjh49Ck9PTw
DA8ePH0bNnTygUCjz//PMIDQ3F+fPn8cMPP+Dtt9/G0KFD0bJlS8N2p06digcffBDPP/+8oe3BBx+stmZz7dq1C/369UN4eDjmzp0LuVxu+CP65Zdf0KVLFwDAiy++iE2bNmHSpEkICwvDjRs3sH//fpw+fRoPPfQQZs2ahdzcXPz999/48MMPAQBubm5m1zFnzhy89dZb6N+/P/r3748jR44gJiYGpaWl91x348aNKCoqwksvvYTGjRvj8OHDWLp0Kf7++29s3LjRqK9Op0O/fv3Qq1cvLFiwAOvXr8ekSZPg6uqKWbNm4amnnsLQoUOxYsUKjB49GhEREWjWrBmA24fNIiMjcfXqVbzwwgsIDg7Gr7/+ipkzZyI9PR2LFy+Gj48Pli9fjpdeeglPPPEEhg4dCgDo0KEDAODkyZPo3r07mjRpghkzZsDV1RVff/01hgwZgs2bN+OJJ54wqnfChAnw8fHBnDlzUFhYaPbzCQDr169HaWkpJk+ejJs3b2LBggUYMWIEevfujT179uC1115DWloali5dimnTpmHVqlX33Oa7774LuVyOadOmITc3FwsWLMBTTz2FQ4cO3XW9kydPomfPnlCr1Zg+fToUCgVWrlyJqKgo7N27F127dkWvXr3wxRdfGNZ5++23AcDwRgwAunXrds8aJ0+ejEaNGmHu3Lm4ePEiFi9ejEmTJuGrr74y9DFnjKpcDwBcunQJr7/+Onx9fQ1ty5cvR9u2bfH444/D0dERP/zwAyZMmAC9Xo+JEycarX/27FmMHDkSL7zwAsaPH4/WrVujuLgYffr0weXLl/Hyyy8jMDAQX3zxBXbt2lXlsZk7vpgrMTERI0eORJ8+ffDee+8BAE6fPo0DBw5gypQp5m9I1KHWrVsLAIaf0aNHC61Wa9SnqKioynovvPCCcHFxESUlJYa2yMhIERkZadTv//7v/wQAkZWVZWgDIObOnWu4PX36dOHr6yvCw8ON1l+1apUAIBYtWlTl/vV6vRBCiAsXLggAYvXq1YZlI0aMEO3atRNBQUEiLi7O0L569WoBQIwcOVK0b9/e0F5YWCjUarUYNWqUACB+++03IYQQpaWlwtfXV7Rr104UFxcb+v/4448CgJgzZ46hrVevXsLd3V1cunTJZJ2VhYSEGNV2p8jISNG2bVuTy+5Gr9eLVq1aidjYWKP7LSoqEs2aNRPR0dGGNg8PDzFx4sS7bm/AgAEiJCSkxnVkZWUJpVIpBgwYYFRHxWvhzse9e/duAUDs3r3bqN7KEhIShEwmM3p+4+LiBADxzjvvGNpu3bolnJ2dhUwmE19++aWh/cyZM1Ved2+++aZwdXUVf/75p9F9zZgxQzg4OIjLly8LIYS4fv16lXUr9OnTR7Rv397o70Cv14tu3bqJVq1aGdoqXns9evQQZWVlJp616lW8xn18fEROTo6hfebMmQKA6Nixo9Hf7MiRI4VSqbzr32bF8/7ggw8KjUZjaP/oo48EAHHixIm71jRkyBChVCrF+fPnDW3Xrl0T7u7uolevXibXMTU+3E3Fc9a3b1+j19HUqVOFg4OD0XNh7hh1p+LiYhEeHi4CAwNFenr6XbcVGxsrmjdvbtQWEhIiAIjt27cbtS9evFgAEF9//bWhrbCwULRs2dLotV6T8aW65y4uLs7ob3TKlClCrVbX+DVWWZ1OmV69ejUSExOxfv16jBs3DuvXrzd69w3A6Bhlfn4+srOz0bNnTxQVFeHMmTNGfbVaLbKzs3H9+nUkJydjy5Yt6NChA7y9vU3e/9WrV7F06VLMnj27yjvpzZs3w9vbG5MnT66ynkwmM7m9lJQUbNy4EQkJCUaHCO/0zDPP4MyZM4bDaJs3b4aHhwf69Olj1O/3339HVlYWJkyYYHTeYcCAAWjTpg1++uknAMD169exb98+PPvss0Z7iHer8150Oh2ys7ORnZ1t1t4BABw7dgznzp3DqFGjcOPGDcP6hYWF6NOnD/bt22c4bO
Tp6YlDhw7h2rVrFtV3Nz///LPhHfmdj9/cCQl3vt4KCwuRnZ2Nbt26QQiBo0ePVun/3HPPGf7v6emJ1q1bw9XVFSN
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 400x800 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAK9CAYAAABilriBAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABV9klEQVR4nO3deZyN9f//8eeZfTMzllnQmLFlp0+kECOGEUqptGfpW31CWT4SLZZUkpLI0mr5pE+b9hChlIRIJUvIFmbGOvt+rt8fZs7PmYWZ4/BmPO6329zMeV/v6zqvc5lzPc91Xe9zXTbLsiwBAHCeeZguAABwaSKAAABGEEAAACMIIACAEQQQAMAIAggAYAQBBAAwggACABhBAAEAjCCAcNGLiYlRv379HI+/++472Ww2fffdd8ZqwsVj3LhxstlsOnLkiOlS3GLu3Lmy2Wzas2eP6VLOyK0BNHv2bMXHxysiIkLe3t6KjIxUbGys5s+fL7vd7s6nwkXqvffe09SpU02XYVxGRobGjRtHSOKS5uXOhc2bN0/Vq1fX008/reDgYJ04cUI///yz+vXrp8WLF+t///ufO58OF6H33ntPmzdv1tChQ8/Zc3To0EGZmZny8fE5Z89xtjIyMjR+/HhJUseOHc0WAxji1gBatWqVvL29ndoeffRRVa1aVa+99pomTpyomJgYdz4lUIyHh4f8/PxMl2FEenq6AgMDTZeBc8hutysnJ6dC/I279RBc0fApVBg6Hh7//+k+//xz9ejRQzVq1JCvr6/q1q2rCRMmKD8/32nejh07ymazOX6qVaumHj16aPPmzU79bDabxo0b59Q2efJk2Wy2Yp8ws7KyNG7cOF1++eXy8/NT9erV1bt3b+3atUuStGfPHtlsNs2dO9dpvkGDBslmszmdbyg83urj46PDhw879V+zZo2j7l9++cVp2kcffaSWLVvK399f1apV0z333KMDBw4UW3fbtm1Tnz59FBYWJn9/fzVo0EBPPvmkpP9/7Pp0P4WHeDp27KimTZsWW35ZrV27Vt26dVNISIgCAgIUGxur1atXO/VJTU3V0KFDFRMTI19fX4WHh6tLly7auHGjo4avv/5ae/fuddRXng8klmXp2Wef1WWXXaaAgABdd911+vPPP4v1K+kc0A8//KDbbrtNtWrVkq+vr6KiojRs2DBlZmY6zduvXz8FBQVp37596tmzp4KCglSzZk3NmDFDkvTHH3+oU6dOCgwMVHR0tN57771iz3/ixAkNHTpUUVFR8vX1Vb169TRp0iTHYeg9e/YoLCxMkjR+/HjHujj173fbtm269dZbVaVKFfn5+alVq1b64osvnJ6n8G/v+++/18CBAxUeHq7LLrusTOuy8G/8pZde0owZM1SnTh0FBASoa9eu2r9/vyzL0oQJE3TZZZfJ399fvXr10rFjx5yWUZb38NatW+Xv76/77rvPad4ff/xRnp6eevzxx89Y64oVK9S+fXsFBgYqNDRUvXr10tatWx3Ty/M+OJ0TJ06oX79+Cg0NVUhIiPr376+MjAynPnPmzFGnTp0UHh4uX19fNW7cWLNmzXLqc7p6Tt12vPTSS2rbtq2qVq0qf39/tWzZUh9//HGxumw2mwYPHqwFCxaoSZMm8vX11ZIlSyRJf/75pzp16iR/f39ddtllevbZZ0s93TFz5kzH/DVq1NCgQYN04sQJpz5Fz6cW6tixY7Ht6PTp09WkSRMFBASocuXKatWqVYnvh9Nx6x5QoRMnTigvL0+pqanasGGDXnrpJd1xxx2qVauWo8/cuXMVFBSk4cOHKygoSCtWrNCYMWOUkpKiyZMnOy2vYcOGevLJJ2VZlnbt2qUpU6aoe/fu2rdv32lrmDhxYrH2/Px89ezZU8uXL9cdd9yhIUOGKDU1VcuWLdPmzZtVt27dEpe3c+dOvfnmm6U+n6enp959910NGzbM0TZnzhz5+fkpKyvLqe/cuXPVv39/XX
XVVZo4caISExP16quvavXq1fr1118VGhoqSfr999/Vvn17eXt768EHH1RMTIx27dqlL7/8Us8995x69+6tevXqOZY7bNgwNWrUSA8++KCjrVGjRqXWXFYrVqzQ9ddfr5YtW2rs2LHy8PBwvBF/+OEHtW7dWpL073//Wx9//LEGDx6sxo0b6+jRo/rxxx+1detWXXnllXryySeVnJysf/75R6+88ookKSgoqMx1jBkzRs8++6y6d++u7t27a+PGjeratatycnLOOO9HH32kjIwMPfzww6patarWrVun6dOn659//tFHH33k1Dc/P1/XX3+9OnTooBdffFELFizQ4MGDFRgYqCeffFJ33323evfurdmzZ+u+++5TmzZtVLt2bUknD63FxsbqwIEDeuihh1SrVi399NNPGj16tA4dOqSpU6cqLCxMs2bN0sMPP6ybb75ZvXv3liQ1b95c0smNSrt27VSzZk2NGjVKgYGB+vDDD3XTTTdp4cKFuvnmm53qHThwoMLCwjRmzBilp6eXeX1K0oIFC5STk6NHHnlEx44d04svvqg+ffqoU6dO+u677/T4449r586dmj59ukaMGKF33nnHMW9Z3sONGjXShAkT9Nhjj+nWW2/VjTfeqPT0dPXr108NGzbUM888c9r6vv32W11//fWqU6eOxo0bp8zMTE2fPl3t2rXTxo0bFRMT47b3QZ8+fVS7dm1NnDhRGzdu1FtvvaXw8HBNmjTJ0WfWrFlq0qSJbrzxRnl5eenLL7/UwIEDZbfbNWjQIEkqVo8kbdiwQVOnTlV4eLij7dVXX9WNN96ou+++Wzk5OXr//fd122236auvvlKPHj2c5l+xYoU+/PBDDR48WNWqVVNMTIwSEhJ03XXXKS8vz/F38sYbb8jf37/Yaxs3bpzGjx+vuLg4Pfzww9q+fbtmzZql9evXa/Xq1aXuPJTmzTff1KOPPqpbb71VQ4YMUVZWln7//XetXbtWd911V9kXZJ0DDRo0sCQ5fu677z4rNzfXqU9GRkax+R566CErICDAysrKcrTFxsZasbGxTv2eeOIJS5KVlJTkaJNkjR071vF45MiRVnh4uNWyZUun+d955x1LkjVlypRiz2+32y3Lsqzdu3dbkqw5c+Y4pvXp08dq2rSpFRUVZfXt29fRPmfOHEuSdeedd1rNmjVztKenp1vBwcHWXXfdZUmy1q9fb1mWZeXk5Fjh4eFW06ZNrczMTEf/r776ypJkjRkzxtHWoUMHq1KlStbevXtLrLOo6Ohop9pOFRsbazVp0qTEaadjt9ut+vXrW/Hx8U7Pm5GRYdWuXdvq0qWLoy0kJMQaNGjQaZfXo0cPKzo6utx1JCUlWT4+PlaPHj2c6ij8Wzj1da9cudKSZK1cudKp3qImTpxo2Ww2p/Xbt29fS5L1/PPPO9qOHz9u+fv7WzabzXr//fcd7du2bSv2dzdhwgQrMDDQ+uuvv5yea9SoUZanp6e1b98+y7Is6/Dhw8XmLdS5c2erWbNmTu8Du91utW3b1qpfv76jrfBv79prr7Xy8vJKWGulK/wbDwsLs06cOOFoHz16tCXJatGihdN79s4777R8fHycairrezg/P9+69tprrYiICOvIkSPWoEGDLC8vL8d74nSuuOIKKzw83Dp69Kij7bfffrM8PDys++67r8R5Tvc+KMnYsWMtSdaAAQOc2m+++WaratWqTm0lveb4+HirTp06pS7/8OHDVq1ataxmzZpZaWlppS4rJyfHatq0qdWpUyendkmWh4eH9eeffzq1Dx061JJkrV271tGWlJRkhYSEWJKs3bt3O9p8fHysrl27Wvn5+Y6+r732miXJeueddxxtpa27otvhXr16ubQ9KeqcDMOeM2eOli1bpgULFuj+++/XggULnD6NSHJK6dTUVB05ckTt27dXRkaGtm3b5tQ3NzdXR44c0eHDh7VmzRp9+umnat68uapVq1bi8x84cEDTp0/X008/XewT9s
KFC1WtWjU98sgjxeaz2WwlLm/Dhg366KOPNHHiRKfDiKe69957tW3bNsehtoULFyokJESdO3d26vfLL78oKSlJAwc
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 400x800 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAboAAAK9CAYAAABSGqmgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABsU0lEQVR4nO3dd3gU5doG8Hu2b3pCOgQSivSiQZAmCIFQREBUwAJEj3wqHIWoHLBQVY6oKCCClaLoASx4VAQigihGUHpoEgiCgSSEkJ5sfb8/QvawKbAJG3Yy3L/r4tKdeXb2yWYz9847TRJCCBARESmUytMNEBER1SUGHRERKRqDjoiIFI1BR0REisagIyIiRWPQERGRojHoiIhI0Rh0RESkaAw6IiJSNAYdURWio6Mxfvx4ty5z/PjxiI6OdusyPW3FihWQJAmnTp2qk+XPmjULkiTVybKvt23btkGSJHz++eeebsUtTp06BUmSsGLFCk+3clU3dNAtW7YM8fHxCAsLg1arRXh4OHr37o1Vq1bBbrd7uj2qh86ePYtZs2Zh3759nm5FsV555RWsX7/e021QPXJDB93KlSvh7e2NF198ER9++CGee+45NGzYEOPHj8cDDzzg6faoHjp79ixmz55dZdC9//77OHbs2PVvqh574YUXUFJS4jSNQUc1pfF0A560fft2aLVap2lPPvkkGjRogLfffhvz5s1T3FDTjaq4uBheXl4e7aHiZ42uTqPRQKO5oVdTHiWHvxt3uKG36Kpb8ZSHm0r1v7fn66+/xpAhQxAZGQm9Xo9mzZph7ty5sNlsTs/t06cPJEly/AsODsaQIUOQkpLiVCdJEmbNmuU07bXXXoMkSejTp4/T9NLSUsyaNQs33XQTDAYDIiIicPfdd+PEiRMAqh8rnzhxIiRJctrXVL5PRafT4fz58071ycnJjr7/+OMPp3nr1q1DbGwsjEYjgoOD8eCDDyI9Pb3Se3f06FHcd999CAkJgdFoRMuWLfH8888D+N/+liv927Ztm+N9bNeuXaXlu6L8ubt378btt98OLy8vPPfccwAAk8mEmTNnonnz5tDr9YiKisLUqVNhMpmuuMycnBw888wzaN++PXx8fODn54dBgwZh//79jppt27bh1ltvBQAkJCQ4fqby38vl++gsFguCgoKQkJBQ6bXy8/NhMBjwzDPPOKbVtu/Lvf7665AkCX/99VeledOnT4dOp8PFixcd03bu3ImBAwfC398fXl5e6N27N3bs2OHSa73zzjto27Yt9Ho9IiMjMXHiROTm5laq27lzJwYPHozAwEB4e3ujQ4cOWLhwoWN+xX10kiShqKgIK1eudLy/48ePx9atWyFJEr766qtKr/Hpp59CkiQkJydfseeTJ0/i3nvvRVBQELy8vHDbbbfhu+++c8wv38d2pX8V/6arYrfb8fLLL6NRo0YwGAzo168fUlNTnWp+/vln3HvvvWjcuLHj9z1lyhSnrdsr9XP5F/SarLuq+7vJzc3F+PHj4e/vj4CAAIwbN67K3ycA/Pjjj+jVqxe8vb0REBCAYcOG4ciRI0411e2vrmqfbFJSEnr27ImAgAD4+PigZcuWjr5cxa9KKPslWq1WFBQUYPfu3Xj99dcxevRoNG7c2FGzYsUK+Pj4IDExET4+Pvjxxx8xY8YM5Ofn47XXXnNaXqtWrfD8889DCIETJ05gwYIFGDx4ME6fPn3FHubNm1dpus1mw5133oktW7Zg9OjReOqpp1BQUICkpCSkpKSgWbNmVS4vNTUV77//frWvp1ar8cknn2DKlCmOacuXL4fBYEBpaalT7YoVK5CQkIBbb70V8+bNQ2ZmJhYuXIgdO3Zg7969CAgIAAAcOHAAvXr1glarxYQJExAdHY0TJ07gm2++wcsvv4y7774bzZs3dyx3ypQpaN26NSZMmOCY1rp162p7rokLFy5g0KBBGD16NB588EGEhYXBbrfjrrvuwi+//I
IJEyagdevWOHjwIN588038+eefVxwOO3nyJNavX497770XMTExyMzMxLvvvovevXvj8OHDiIyMROvWrTFnzhzMmDEDEyZMQK9evQAA3bt3r7Q8rVaLESNG4Msvv8S7774LnU7nmLd+/XqYTCaMHj0aAK6p78vdd999mDp1KtauXYtnn33Wad7atWsxYMAABAYGAihbWQ0aNAixsbGYOXMmVCoVli9fjr59++Lnn39Gly5dqn2dWbNmYfbs2YiLi8Pjjz+OY8eOYenSpfj999+xY8cOxxfMpKQk3HnnnYiIiMBTTz2F8PBwHDlyBN9++y2eeuqpKpf98ccf4x//+Ae6dOni+Nw0a9YMt912G6KiorB69WqMGDHC6TmrV69Gs2bN0K1bt2p7zszMRPfu3VFcXOwY1Vm5ciXuuusufP755xgxYgRat26Njz/+2PGc9957D0eOHMGbb77pmNahQ4dqX6Pcv//9b6hUKjzzzDPIy8vD/Pnz8cADD2Dnzp2OmnXr1qG4uBiPP/44GjRogF27dmHx4sX4+++/sW7dOgCo1A9Qth5JTExEaGioY1pN1l1V/d0IITBs2DD88ssveOyxx9C6dWt89dVXGDduXKWf7YcffsCgQYPQtGlTzJo1CyUlJVi8eDF69OiBPXv21HiE7NChQ7jzzjvRoUMHzJkzB3q9HqmpqS5/4XIQJFq2bCkAOP6NHTtWWCwWp5ri4uJKz/u///s/4eXlJUpLSx3TevfuLXr37u1U99xzzwkAIisryzENgJg5c6bj8dSpU0VoaKiIjY11ev5HH30kAIgFCxZUen273S6EECItLU0AEMuXL3fMu++++0S7du1EVFSUGDdunGP68uXLBQAxZswY0b59e8f0oqIi4efnJ+6//34BQPz+++9CCCHMZrMIDQ0V7dq1EyUlJY76b7/9VgAQM2bMcEy7/fbbha+vr/jrr7+q7LOiJk2aOPV2ud69e4u2bdtWOe9qevfuLQCIZcuWOU3/+OOPhUqlEj///LPT9GXLlgkAYseOHdX2VlpaKmw2m9Pz0tLShF6vF3PmzHFM+/333yv9LsqNGzdONGnSxPF406ZNAoD45ptvnOoGDx4smjZtWqu+r6Zbt24iNjbWadquXbsEALFq1SohRNnvq0WLFiI+Pt7pd1dcXCxiYmJE//79HdPKP09paWlCCCGysrKETqcTAwYMcHq/3n77bQFAfPTRR0IIIaxWq4iJiRFNmjQRFy9edOrn8tecOXOmqLia8vb2rvJzM336dKHX60Vubq5jWlZWltBoNE5/a1WZPHmyAOD0HhcUFIiYmBgRHR1d6XcvROXf59Vs3bpVABCtW7cWJpPJMX3hwoUCgDh48KBjWlXrm3nz5glJkir9fZWz2+3izjvvFD4+PuLQoUNXXFZ1666q/m7Wr18vAIj58+c7plmtVtGrV69Kn/VOnTqJ0NBQceHCBce0/fv3C5VKJcaOHeuYVt17V/H3/eabbwoA4vz581X+zK66oYcuyy1fvhxJSUlYvXo1HnnkEaxevdppKwMAjEaj4/8LCgqQnZ2NXr16obi4GEePHnWqtVgsyM7Oxvnz55GcnIyvvvoKHTp0QHBwcJWvn56ejsWLF+PFF1+Ej4+P07wvvvgCwcHB+Oc//1npedUddr17926sW7cO8+bNcxp+vdxDDz2Eo0ePOoYov/jiC/j7+6Nfv35OdX/88QeysrLwxBNPwGAwOKYPGTIErVq1cgztnD9/Htu3b8fDDz/stCV8pT6vxmazITs7G9nZ2TCbzTV6rl6vrzQsuG7dOrRu3RqtWrVyLDc7Oxt9+/YFAGzduvWKyyt/L202Gy5cuOAYRtmzZ08Nf7Iyffv2RXBwMNasWeOYdvHiRSQlJWHUqFFu6buiUaNGYffu3Y5hbwBYs2YN9Ho9hg0bBgDYt28fjh8/jvvvvx8XLlxwvF5RURH69euH7du3V3tU8g8//ACz2YzJkyc7ffYeffRR+Pn5OT
4ve/fuRVpaGiZPnuwYEShX28/L2LFjYTKZnA7fX7NmDaxWKx588MErPnfDhg3o0qULevbs6Zjm4+ODCRMm4NSpUzh
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 400x800 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAALMCAYAAADkXsVPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABIGElEQVR4nO3deXxMZ///8feEZJLIhiQIEdvPVlRLKUG0VGqraqslWkH39i6Vuwu3W5toS5dvVS2lrbtoVbWo9m5VbaW62JfaKY2laguVIERkzu8PJ3Mbk0SMkYO8no9HHpzrXOfMZ04m13vOMnNshmEYAgAUez5WFwAAuDoQCAAASQQCAMBEIAAAJBEIAAATgQAAkEQgAABMBAIAQBKBAAAwEQi4akyaNEk2m027du2yuhQnm82m5ORk5/TVWOP1aPHixbLZbJoxY4bVpXjFrl27ZLPZNGnSJKtLKZAlgTB+/HjFx8erXLly8vX1Vfny5RUXF6ePP/5YDofDipKAKyYzM1PJyclavHix1aUABbIkECZPnqxSpUppyJAh+s9//qN//etfqlixonr37q2ePXtaURKuAg899JBOnTqlmJgYq0vJlyc1ZmZmKiUlhUDAVa+kFQ+6ZMkS+fr6urT169dPZcuW1ZgxYzR8+HBVqVLFitJgoRIlSqhEiRJWl1Gga6FGWCMzM1OBgYFWl3FZLNlDuDAMcuWGgI/P/8r6+uuv1bFjR0VFRclut6t69ep65ZVXlJOT47Js69atZbPZnD/h4eHq2LGjNm7c6NLvwmPCkvTWW2/JZrOpdevWLu2nT59WcnKyatasKX9/f1WoUEH33HOPdu7cKSn/44JPP/20bDabevfu7WzLPfbs5+enw4cPu/RfunSps+5Vq1a5zJs+fboaNWqkgIAAhYeH68EHH9S+ffvctt3WrVt1//33KyIiQgEBAapVq5YGDx4sSUpOTnbZNnn95L57bd26terVq+e2/sLIXXb9+vWKi4tTYGCgatSo4TwO/OOPP6pp06bO+hYsWOCyfF7H51etWqX4+HiFh4crICBAVatWVd++fV2WmzZtmho1aqTg4GCFhISofv36evfddy+p9qysLA0YMEAREREKDg7WXXfdpT///NOt36XWuGvXLkVEREiSUlJSnNs79zW4fv169e7dW9WqVZO/v7/Kly+vvn376siRIy6Pm/s73LFjh3r37q2wsDCFhoaqT58+yszMdKtzypQpatKkiQIDA1W6dGm1atVK8+bNc+kzZ84ctWzZUqVKlVJwcLA6duyoTZs2FWp7/fHHH+rWrZvKlCmjwMBA3XrrrZo9e7Zzfu45gIJ+Lvw7zIvD4dBrr72mSpUqyd/fX23atNGOHTtc+vz000/q1q2bKleuLLvdrujoaA0YMECnTp0qVD3nv/m8lPGmXr16Wr16tVq1aqXAwED961//kiQdO3ZMvXv3VmhoqMLCwpSYmKhjx47l+fx++OEH5+8gLCxMXbp00ZYtW1z69O7dO883yLmvifPNnz9fLVq0UFhYmIKCglSrVi1nXYVhyR5CrmPHjuns2bM6fvy4Vq9erf/7v/9T9+7dVblyZWefSZMmKSgoSElJSQoKCtIPP/ygl156SRkZGXrrrbdc1le7dm0NHjxYhmFo586dGjFihDp06KA9e/YUWMPw4cPd2nNyctSpUyctXLhQ3bt3V//+/XX8+HHNnz9fGzduVPXq1fNc344dO/Thhx/m+3glSpTQlClTNGDAAGfbxIkT5e/vr9OnT7v0nTRpkvr06aNbbrlFw4cP18GDB/Xuu+/ql19+0dq1axUWFibp3KDSsmVL+fr66rHHHlOVKlW0c+dOffPNN3rttdd0zz33qEaNGs71DhgwQHXq1NFjjz3mbKtTp06+NV+Kv//+W506dVL37t3VrVs3jRs3Tt27d9enn36qZ599Vk888YQSEhL01ltv6b777tPevXsVHByc57oOHTqkdu3aKS
IiQgMHDlRYWJh27dqlL7/80tln/vz56tGjh9q0aaM33nhDkrRlyxb98ssv6t+/f6HrfuSRRzRlyhQlJCSoefPm+uGHH9SxY8eLLnexGiMiIjRu3Dg9+eST6tq1q+655x5JUoMGDZz1//HHH+rTp4/Kly+vTZs26YMPPtCmTZu0bNkytz/4+++/X1WrVtXw4cO1Zs0aTZgwQZGRkc7nLp0LnuTkZDVv3lxDhw6Vn5+fli9frh9++EHt2rWTJH3yySdKTExUfHy83njjDWVmZmrcuHFq0aKF1q5dW+Ae+sGDB9W8eXNlZmY69+wnT56su+66SzNmzFDXrl1Vp04dffLJJ85lPvjgA23ZskXvvPOOsy13GxTk9ddfl4+Pj5577jmlp6frzTffVM+ePbV8+XJnn+nTpyszM1NPPvmkypYtqxUrVmj06NH6888/NX36dElyq0c697eflJSkyMhIZ9uljDdHjhxR+/bt1b17dz344IMqV66cDMNQly5d9PPPP+uJJ55QnTp1NGvWLCUmJro9twULFqh9+/aqVq2akpOTderUKY0ePVqxsbFas2bNJR8l2bRpkzp16qQGDRpo6NChstvt2rFjh3755ZfCr8SwUK1atQxJzp9evXoZ2dnZLn0yMzPdlnv88ceNwMBA4/Tp0862uLg4Iy4uzqXfv/71L0OScejQIWebJOPll192Tr/wwgtGZGSk0ahRI5flP/roI0OSMWLECLfHdzgchmEYRmpqqiHJmDhxonPe/fffb9SrV8+Ijo42EhMTne0TJ040JBk9evQw6tev72w/efKkERISYiQkJBiSjJUrVxqGYRhnzpwxIiMjjXr16hmnTp1y9v/2228NScZLL73kbGvVqpURHBxs7N69O886LxQTE+NS2/ni4uKMG264Ic95FxMXF2dIMqZOneps27p1qyHJ8PHxMZYtW+Zsnzt3rtu2y91GqamphmEYxqxZs1y2SV769+9vhISEGGfPnvWoZsMwjHXr1hmSjKeeesqlPfd3cv7rxZMaDx8+7LaeXHm9vj/77DNDkrFkyRJn28svv2xIMvr27evSt2vXrkbZsmWd07///rvh4+NjdO3a1cjJyXHpm/t6OH78uBEWFmY8+uijLvMPHDhghIaGurVf6NlnnzUkGT/99JOz7fjx40bVqlWNKlWquD2uYRhGYmKiERMTU+B6z7do0SJDklGnTh0jKyvL2f7uu+8akowNGzY42/LahsOHDzdsNpvb30Quh8NhdOrUyQgKCjI2bdpU4LryG28kGePHj3fp+9VXXxmSjDfffNPZdvbsWaNly5Zur/eGDRsakZGRxpEjR5xtv/32m+Hj42P06tXL2Zbftst9TeR65513DEnG4cOH83zOhWHpZacTJ07U/Pnz9emnn+rhhx/Wp59+6vKuVZICAgKc/z9+/LjS0tLUsmVLZWZmauvWrS59s7OzlZaWpsOHD2vp0qWaNWuWGjRooPDw8Dwff9++fRo9erSGDBmioKAgl3kzZ85UeHi4nnnmGbflLnzXlmv16tWaPn26hg8f7nLY63wPPfSQtm7d6jw0NHPmTIWGhqpNmzYu/VatWqVDhw7pqaeekr+/v7O9Y8eOql27tnP3/PDhw1qyZIn69u3rsmdVUJ0Xk5OTo7S0NKWlpenMmTOXtGxQUJC6d+/unK5Vq5bCwsJUp04dNW3a1Nme+/8//vgj33Xl7gF9++23ys7OzrfPyZMnNX/+/Euq83zfffedpHPnsc737LPPXnTZwtRYkPNf36dPn1ZaWppuvfVWSdKaNWvc+j/xxBMu0y1bttSRI0eUkZEhSfrqq6/kcDj00ksvub0Gc18P8+fP17Fjx9SjRw/n7zktLU0lSpRQ06ZNtWjRogJr/u6779SkSRO1aNHC2RYUFKTHHntMu3bt0ubNmy9hCxSsT58+8vPzc063bNlSkuvr5vxtePLkSaWlpal58+YyDENr167Nc72vvPKKvv32W02aNEl169
bNc10XG2/sdrv69Onj0vbdd9+pZMmSevLJJ51tJUqUcBtH9u/fr3Xr1ql3794qU6aMs71Bgwa64447nK/JS5H7Wvz
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 400x800 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAacAAAK9CAYAAACASqP4AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABc3klEQVR4nO3dd3gU1f4G8Hc22ZJeSCMQSAjdUBSkS4dIk2YBC6EoemlSvCgqVa9RvCLS5SdSRCyAYEWqgCKConRCTQAhCSSkkL7l/P7IZm82u4HNssmO2ffzPHlgz5yd/e5kM+/OzJkZSQghQEREJCMKZxdARERUFsOJiIhkh+FERESyw3AiIiLZYTgREZHsMJyIiEh2GE5ERCQ7DCciIpIdhhMREckOw4lsNnLkSHh7e1fpayYlJUGSJKxZs6ZKX9dVzJkzB5IkVdnrSZKEOXPmVNnrOUJkZCT69+/v7DIcpmvXrujatauzy7irCoXTihUrEBsbi9DQUCiVSoSFhaFLly5Yt24dDAZDZdVIZJe8vDzMmTMHe/fudXYp/yhvvfUWtm7d6uwyyMVVKJzWrl0LLy8vzJw5E6tWrcKrr76KWrVqYeTIkXjqqacqq0Yiu+Tl5WHu3LkMpzt4/fXXkZ+fb9bGcCI5cK9I5/3790OpVJq1TZo0CTVq1MCSJUsQHx+PyMhIR9ZHRJXI3d0d7u4VWg2QzOXm5sLLy8vZZdyzCm05lQ2mEiWBpFD8b3Zff/01+vXrh/DwcKjVakRHR+ONN96AXq83e27Xrl0hSZLpJygoCP369cPJkyfN+lnbV/3uu+9CkiSL/acFBQWYM2cOGjZsCI1Gg5o1a2LIkCG4ePEigPKPY4wfPx6SJGHkyJGmtjVr1kCSJKhUKty8edOs/8GDB011//HHH2bTNm7ciFatWsHDwwNBQUF4+umnce3aNYtll5CQgMcffxzBwcHw8PBAo0aN8NprrwH43/GAO/2UbBV07doVMTExFvO3xc8//4zHHnsMderUgVqtRkREBKZMmWLxjbrEpUuXEBsbCy8vL4SHh2PevHkoe3H7zz//HK1atYKPjw98fX3RrFkzfPDBBxbzeeyxxxAYGAhPT0+0a9cO33///V3rLW+f+ciRI02fxaSkJAQHBwMA5s6da1pepT9DCQkJePTRRxEYGAiNRoPWrVvjm2++uevrlyVJEiZMmICNGzeiadOm8PDwQPv27XHixAkAwIcffoj69etDo9Gga9euSEpKMnt+RZZ/yWtoNBrExMRgy5YtZu+75L1LkoT//ve/WLlyJaKjo6FWq/Hggw/i999/N5tf2WNOkiQhNzcXa9euNS2zkr+Hsq9T3jwAoLCwEFOmTEFwcDB8fHzwyCOP4O+//7a6/K5du4bRo0cjNDQUarUa9913Hz7++OPyFrcZnU6HN954w/QeIyMj8eqrr6KwsNDUJzIy8o5/Q7Z+of7ll1/Qpk0baDQa1KtXD+vWrTObfuvWLbz00kto1qwZvL294evriz59+uDYsWNm/e5UT8nf8+XLlzFu3Dg0atQIHh4eqFGjBh577DGLz07J+mnfvn0YN24cQkJCULt2bdP0kt+/h4cH2rRpg59//tnqe7tx4wbGjBmD0NBQaDQatGjRAmvXrjXrs3fvXrMaS1hbn6akpGDUqFGoXbs21Go1atasiYEDB1rUfyd2fWXKzMyETqfD7du3ceTIEfz3v//FsGHDUKdOHVOfNWvWwNvbG1OnToW3tzf27NmDWbNmITs7G++++67Z/Bo3bozXXnsNQghcvHgRCxYsQN++fXHlypU71hAfH2/Rrtfr0b9/f+zevRvDhg3Diy++iNu3b2Pnzp04efIkoqOjrc7vwoUL+L//+79yX8/NzQ3r16/HlClTTG2rV6+GRqNBQUGBWd81a9Zg1KhRePDBBxEfH4/U1FR88MEHOHDgAP766y/4+/sDAI4fP46HHnoISqUSY8eORWRkJC5evI
hvv/0W//nPfzBkyBDUr1/fNN8pU6agSZMmGDt2rKmtSZMm5dZsq40bNyIvLw//+te/UKNGDRw+fBiLFy/G33//jY0bN5r11ev1ePjhh9GuXTvMnz8fP/74I2bPng2dTod58+YBAHbu3Inhw4ejR48eeOeddwAAZ86cwYEDB/Diiy8CAFJTU9GhQwfk5eWZtr7Xrl2LRx55BJs2bcLgwYPv6T0FBwdj+fLl+Ne//oXBgwdjyJAhAIDmzZsDAE6dOoWOHTuiVq1aeOWVV+Dl5YUvv/wSgwYNwubNmyv8+j///DO++eYbjB8/HgAQHx+P/v37Y/r06Vi2bBnGjRuHjIwMzJ8/H6NHj8aePXtMz7V1+X///fd44okn0KxZM8THxyMjIwNjxoxBrVq1rNa0YcMG3L59G88//zwkScL8+fMxZMgQXLp0qdwvmp988gmeffZZtGnTxvQ5K+9v5k6effZZrF+/Hk8++SQ6dOiAPXv2oF+/fhb9UlNT0a5dO1PABwcHY9u2bRgzZgyys7MxefLku77O2rVr8eijj2LatGk4dOgQ4uPjcebMGWzZsgUAsHDhQuTk5AAo/hy+9dZbePXVV01/O7YM8rlw4QIeffRRjBkzBnFxcfj4448xcuRItGrVCvfddx+A4i9bW7duxWOPPYaoqCikpqbiww8/RJcuXXD69GmEh4db1FPi/fffx9GjR1GjRg0AwO+//45ff/0Vw4YNQ+3atZGUlITly5eja9euOH36NDw9Pc2eP27cOAQHB2PWrFnIzc0FAKxatQrPP/88OnTogMmTJ+PSpUt45JFHEBgYiIiICNNz8/Pz0bVrV1y4cAETJkxAVFQUNm7ciJEjRyIzM9P0N1sRQ4cOxalTpzBx4kRERkbixo0b2LlzJ65cuWL73jVhh0aNGgkApp8RI0YIrVZr1icvL8/iec8//7zw9PQUBQUFprYuXbqILl26mPV79dVXBQBx48YNUxsAMXv2bNPj6dOni5CQENGqVSuz53/88ccCgFiwYIHF6xsMBiGEEImJiQKAWL16tWna448/LmJiYkRERISIi4szta9evVoAEMOHDxfNmjUztefm5gpfX1/x5JNPCgDi999/F0IIUVRUJEJCQkRMTIzIz8839f/uu+8EADFr1ixTW+fOnYWPj4+4fPmy1TrLqlu3rlltpXXp0kXcd999VqfdjbXfVXx8vJAkyay2uLg4AUBMnDjRrNZ+/foJlUolbt68KYQQ4sUXXxS+vr5Cp9OV+5qTJ08WAMTPP/9sart9+7aIiooSkZGRQq/XCyGs/66sfWZK6qtbt67p8c2bNy0+NyV69OghmjVrZvZZNBgMokOHDqJBgwbl1m0NAKFWq0ViYqKp7cMPPxQARFhYmMjOzja1z5gxQwAw62vr8m/WrJmoXbu2uH37tqlt7969AoDZ+y5ZZjVq1BC3bt0ytX/99dcCgPj2229NbbNnzxZlVwNeXl5WP2dll2958zh69KgAIMaNG2fWr+RvpfTvY8yYMaJmzZoiLS3NrO+wYcOEn5+f1WVT9nWeffZZs/aXXnpJABB79uyxeM5PP/0kAIiffvqp3PmWVbduXQFA7N+/39R248YNoVarxbRp00xtBQUFps9ticTERKFWq8W8efPKnf+XX34pAJj1sfa+Dx48KACIdevWmdpK1k+dOnUy+3srWQ+1bNlSFBYWmtpXrlwpAJj9/SxcuFAAEOvXrzd7fvv27YW3t7fp81vesiv7N5qRkSEAiHfffbfc92wLu4aSr169Gjt37sSnn36KMWPG4NNPPzX7Ng8AHh4epv/fvn0baWlpeOihh5CXl4eEhASzvlqtFmlpabh58yYOHjyILVu2oHnz5ggKCrL6+teuXcPixYsxc+ZMi289mzdvRlBQECZOnGjxvPKGzB45cgQbN25EfHy82a7J0p555hkkJCSYdt9t3rwZfn5+6NGjh1m/P/74Azdu3MC4ceOg0WhM7f369UPjxo1Nu6
1u3ryJ/fv3Y/To0WZbnHeq8270ej3S0tKQlpaGoqIim59X+neVm5uLtLQ0dOjQAUII/PXXXxb9J0yYYFbrhAkTUFR
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 400x800 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Список числовых колонок, для которых строим графики, чтобы найти зависимости опасности от других колонок\n",
|
|||
|
"numeric_columns = ['est_diameter_min', 'est_diameter_max', 'relative_velocity', 'miss_distance', 'absolute_magnitude']\n",
|
|||
|
"\n",
|
|||
|
"# Создание диаграмм зависимости\n",
|
|||
|
"for column in numeric_columns:\n",
|
|||
|
" plt.figure(figsize=(4, 8)) # Установка размера графика\n",
|
|||
|
" plt.scatter(df2['hazardous'], df2[column], alpha=0.5) # Создаем диаграмму рассеяния\n",
|
|||
|
" plt.title(f'Зависимость {column} от hazardous')\n",
|
|||
|
" plt.xlabel('hazardous (0 = нет, 1 = да)')\n",
|
|||
|
" plt.ylabel(column)\n",
|
|||
|
" plt.xticks([0, 1]) # Установка меток по оси X\n",
|
|||
|
" plt.grid() # Добавление сетки для удобства восприятия\n",
|
|||
|
" plt.show() # Отображение графика"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 51,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"#Разбиение набора данных на обучающую, контрольную и тестовую выборки\n",
|
|||
|
"def split_stratified_into_train_val_test(\n",
|
|||
|
" df_input,\n",
|
|||
|
" stratify_colname=\"y\",\n",
|
|||
|
" frac_train=0.6,\n",
|
|||
|
" frac_val=0.15,\n",
|
|||
|
" frac_test=0.25,\n",
|
|||
|
" random_state=None,\n",
|
|||
|
"):\n",
|
|||
|
"\n",
|
|||
|
" if frac_train + frac_val + frac_test != 1.0:\n",
|
|||
|
" raise ValueError(\n",
|
|||
|
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
|
|||
|
" % (frac_train, frac_val, frac_test)\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" if stratify_colname not in df_input.columns:\n",
|
|||
|
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
|
|||
|
"\n",
|
|||
|
" X = df_input # Contains all columns.\n",
|
|||
|
" y = df_input[\n",
|
|||
|
" [stratify_colname]\n",
|
|||
|
" ] # Dataframe of just the column on which to stratify.\n",
|
|||
|
"\n",
|
|||
|
" # Split original dataframe into train and temp dataframes.\n",
|
|||
|
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
|
|||
|
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" # Split the temp dataframe into val and test dataframes.\n",
|
|||
|
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
|
|||
|
" df_val, df_test, y_val, y_test = train_test_split(\n",
|
|||
|
" df_temp,\n",
|
|||
|
" y_temp,\n",
|
|||
|
" stratify=y_temp,\n",
|
|||
|
" test_size=relative_frac_test,\n",
|
|||
|
" random_state=random_state,\n",
|
|||
|
" )\n",
|
|||
|
"\n",
|
|||
|
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
|
|||
|
"\n",
|
|||
|
" return df_train, df_val, df_test"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 53,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"hazardous\n",
|
|||
|
"False 81996\n",
|
|||
|
"True 8840\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Обучающая выборка: (54501, 6)\n",
|
|||
|
"hazardous\n",
|
|||
|
"False 49197\n",
|
|||
|
"True 5304\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgkAAADECAYAAAAVi7K7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA9TklEQVR4nO3dd1xT1/sH8E8SIGHvLbLBgQNxtFbBjavUVVu3WBxVa7X6tdXWgdWidddtHVDxa61StVonbqxVceBWkOFmbwiQ5Pz+4Jd8CUkQELwgz/v1yktzOffc5557c/PknDt4jDEGQgghhJBy+FwHQAghhJC6iZIEQgghhKhFSQIhhBBC1KIkgRBCCCFqUZJACCGEELUoSSCEEEKIWpQkEEIIIUQtShIIIYQQopYW1wEQQkhDUFxcjIyMDMhkMtjZ2XEdDqlBYrEYGRkZ0NLSgpWVFdfh1CjqSSCkDhg7diwMDAy4DqPGLFy4EDwej+swOBcdHY3hw4fDwsICQqEQtra2GDx4MNdh1Rvr169HVlaW4v2aNWuQn5/PXUBlREZGIiAgACYmJtDV1YW9vT2+/vprrsOqcVXqSQgNDUVgYKDivVAoROPGjdGrVy/MmzcP1tbWNR4gIYTUR4cOHcJnn32GJk2aYMmSJXB1dQWA9+6XZm06fPgw4uLiMHPmTFy4cAHz5s3DtGnTuA4LGzduxFdffYVOnTph7dq1sLe3BwA4OjpyHFnNq9Zww6JFi+Ds7AyxWIyoqChs2rQJR48exd27d6Gnp1fTMRJCSL2SkZGBoKAg+Pv7Y9++fdDR0eE6pHpp7ty5CAgIwNq1a8Hn87Fy5Urw+dx2gMfGxuKbb77BhAkTsHHjxve+x6xaSUKfPn3Qtm1bAEBQUBDMzc2xatUqHDp0CMOGDavRAAkhdY9EIoFMJqMvPw127twJsViM0NBQaqO34Ofnh6SkJDx48AAODg5o1KgR1yHhl19+gY2NDX755Zf3PkEAauichG7dugEAEhISAJRm0bNmzUKLFi1gYGAAIyMj9OnTBzExMSrzisViLFy4EB4eHhCJRLC1tcWgQYPw5MkTAEBiYiJ4PJ7GV5cuXRR1nTt3DjweD3v37sXcuXNhY2MDfX19BAQE4NmzZyrLvnLlCnr37g1jY2Po6enBz88Ply5dUruOXbp0Ubv8hQsXqpQNDw+Hj48PdHV1YWZmhs8//1zt8itat7JkMhnWrFmD5s2bQyQSwdraGhMnTkRmZqZSOScnJ/Tv319lOVOnTlWpU13sy5cvV2lTACgqKsKCBQvg5uYGoVAIBwcHzJ49G0VFRWrbqqwuXbqo1LdkyRLw+Xz897//rVZ7rFixAh07doS5uTl0dXXh4+OD/fv3q11+eHg42rdvDz09PZiamsLX1xcnT55UKnPs2DH4+fnB0NAQRkZGaNeunUps+/btU2xTCwsLjBw5Ei9evFAqM3bsWKWYTU1N0aVLF1y8ePGN7ST34sULDBgwAAYGBrC0tMSsWbMglUqrvP7lY1G3zxYXF2P+/Pnw8fGBsbEx9PX10blzZ5w9e1apLvl2WbFiBdasWQNXV1cIhULcv38fABAVFYV27dpBJBLB1dUVW7ZsUbtuEokEP/74o2J+JycnzJ07V2U/0vS5cnJywtixYxXvS0pKEBwcDHd3d4hEIpibm6NTp044depUhW0cGhqq1B56enpo0aIFtm3bVuF8cvHx8fj0009hZmYGPT09fPDBB/j777+Vyvz7779o3bo1fvrpJzg4OEAoFMLd3R1Lly6FTCZTlPPz80OrVq3ULsfT0xP+/v5KMScmJiqVKf/5quw2BVTb+fXr1xg9ejQsLS0hFArh5eWFX3/9VWmesvtCWV5eXiqf8xUrVqiN+cWLFxg3bhysra0hFArRvHlz7NixQ6mM/Fh+7tw5mJiY4MMPP0SjRo3Qr18/jfuHuvnlL6FQCA8PD4SEhKDsg4/l586kpaVprKv8fv
fvv//Cx8cHkydPVqyDurYCgPz8fMycOVOxD3h6emLFihUo//BlHo+HqVOnYvfu3fD09IRIJIKPjw8uXLigVE7duT5nz56FUCjEpEmTlKZXpp0ro0aubpB/oZubmwMo/RAdPHgQn376KZydnZGcnIwtW7bAz88P9+/fV5zZK5VK0b9/f5w+fRqff/45vv76a+Tm5uLUqVO4e/euYgwPAIYNG4a+ffsqLXfOnDlq41myZAl4PB6+/fZbpKSkYM2aNejRowdu3boFXV1dAMCZM2fQp08f+Pj4YMGCBeDz+di5cye6deuGixcvon379ir1NmrUCCEhIQCAvLw8fPnll2qXPW/ePAwdOhRBQUFITU3FunXr4Ovri5s3b8LExERlngkTJqBz584AgD///BMHDhxQ+vvEiRMV54NMmzYNCQkJWL9+PW7evIlLly5BW1tbbTtURVZWlmLdypLJZAgICEBUVBQmTJiApk2b4s6dO1i9ejUeP36MgwcPVmk5O3fuxA8//ICVK1di+PDhasu8qT3Wrl2LgIAAjBgxAsXFxfj999/x6aef4siRI+jXr5+iXHBwMBYuXIiOHTti0aJF0NHRwZUrV3DmzBn06tULQOnBd9y4cWjevDnmzJkDExMT3Lx5E8ePH1fEJ2/7du3aISQkBMnJyVi7di0uXbqksk0tLCywevVqAMDz58+xdu1a9O3bF8+ePVO77cuSSqXw9/dHhw4dsGLFCkRGRmLlypVwdXVV2tcqs/4TJ05Ejx49lOo/fvw4du/erRgTz8nJwbZt2zBs2DCMHz8eubm52L59O/z9/XH16lW0bt1aZduJxWJMmDABQqEQZmZmuHPnDnr16gVLS0ssXLgQEokECxYsUHt+UlBQEMLCwjBkyBDMnDkTV65cQUhICB48eKCyjStj4cKFCAkJQVBQENq3b4+cnBxER0fjxo0b6Nmz5xvnX716NSwsLJCTk4MdO3Zg/PjxcHJyUmm3spKTk9GxY0cUFBRg2rRpMDc3R1hYGAICArB//34MHDgQAJCeno6oqChERUVh3Lhx8PHxwenTpzFnzhwkJiZi8+bNAIBRo0Zh/PjxuHv3Lry8vBTLuXbtGh4/fowffvihSm1S1W0qV1xcjB49euDhw4f48ssv4enpiYMHD2LChAlIT0/Hd999V6U4NElOTsYHH3yg+FK0tLTEsWPH8MUXXyAnJwfTp0/XOO+FCxdw9OjRKi1v7ty5aNq0KQoLCxU/Hq2srPDFF19Uex3S09MRHR0NLS0tTJkyBa6urmrbijGGgIAAnD17Fl988QVat26NEydO4D//+Q9evHihOE7InT9/Hnv37sW0adMgFAqxceNG9O7dG1evXlXaN8qKiYnBgAED0LdvX2zYsEEx/W3aWQWrgp07dzIALDIykqWmprJnz56x33//nZmbmzNdXV32/PlzxhhjYrGYSaVSpXkTEhKYUChkixYtUkzbsWMHA8BWrVqlsiyZTKaYDwBbvny5SpnmzZszPz8/xfuzZ88yAMze3p7l5OQopv/xxx8MAFu7dq2ibnd3d+bv769YDmOMFRQUMGdnZ9azZ0+VZXXs2JF5eXkp3qempjIAbMGCBYppiYmJTCAQsCVLlijNe+fOHaalpaUyPTY2lgFgYWFhimkLFixgZTfLxYsXGQC2e/dupXmPHz+uMt3R0ZH169dPJfYpU6aw8pu6fOyzZ89mVlZWzMfHR6lNd+3axfh8Prt48aLS/Js3b2YA2KVLl1SWV5afn5+ivr///ptpaWmxmTNnqi1bmfZgrHQ7lVVcXMy8vLxYt27dlOri8/ls4MCBKvuifJtnZWUxQ0ND1qFDB1ZYWKi2THFxMbOysmJeXl5KZY4cOcIAsPnz5yumjRkzhjk6OirVs3XrVgaAXb16Ve06l50XgNLngzHGvL29mY+PT5XXv7zY2FhmbGzMevbsySQSCWOMMYlEwoqKipTKZWZmMmtrazZu3DjFNPln0MjIiKWkpC
iVHzBgABOJRCwpKUkx7f79+0wgEChtt1u3bjEALCgoSGn+WbNmMQDszJkzimnl9005R0dHNmbMGMX7Vq1aqd3f30R
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 200x200 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Контрольная выборка: (18167, 6)\n",
|
|||
|
"hazardous\n",
|
|||
|
"False 16399\n",
|
|||
|
"True 1768\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhUAAADECAYAAAAoGdPdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8hUlEQVR4nO3dd1wT9/8H8FcSQgJhLxm1iAxRUaso1oG4EbEWZ11V6Re1tWhttba2PxWtLXXUUXdbB47aCrbSasW9UOsGtyJLRURA2QRI8vn9wTf5EgJIIHCA7+fjkYfmuPvc++5yl3c+447HGGMghBBCCKklPtcBEEIIIaRpoKSCEEIIITpBSQUhhBBCdIKSCkIIIYToBCUVhBBCCNEJSioIIYQQohOUVBBCCCFEJyipIIQQQohOUFJBCCGE1LOsrCw8fPgQMpmM61B0ipIKQhqAyZMnw8jIiOswdCYkJAQ8Ho/rMMhr5smTJ9i+fbvqfVJSEnbv3s1dQGWUlJRg2bJl6NChA0QiEczNzeHq6orjx49zHZpOaZVUbN++HTweT/USi8Vwc3NDcHAw0tLS6ipGQggh5JV4PB4+/vhjHD58GElJSZg7dy7Onj3LdVgoKipC//79MX/+fPTu3Rvh4eE4evQoTpw4gW7dunEdnk7p1WShxYsXw8nJCVKpFNHR0di4cSP++ecf3Lp1C4aGhrqOkRBCCHklBwcHTJkyBYMGDQIA2NnZ4dSpU9wGBWDp0qW4ePEiDh8+jN69e3MdTp2qUVLh5+eHzp07AwCCgoJgaWmJlStXIjIyEmPHjtVpgISQhkcmk0GhUEBfX5/rUAhRs3r1asyYMQMZGRnw8PCARCLhNB6ZTIbVq1dj9uzZTT6hAHTUp6Jv374AgMTERADAixcvMGfOHLRr1w5GRkYwMTGBn58fYmNjNZaVSqUICQmBm5sbxGIx7OzsMHz4cMTHxwMobRMr2+RS/lX2IJ06dQo8Hg+///47vvrqK9ja2kIikWDo0KF4/PixxrovXryIQYMGwdTUFIaGhvDx8cG5c+cq3MbevXtXuP6QkBCNeXft2gVPT08YGBjAwsICY8aMqXD9VW1bWQqFAqtXr0bbtm0hFovRrFkzTJs2DS9fvlSbr0WLFhgyZIjGeoKDgzXKrCj25cuXa+xToLTqbuHChXBxcYFIJELz5s0xd+5cFBUVVbivyurdu7dGed9++y34fD5+/fXXGu2PFStWoHv37rC0tISBgQE8PT0RERFR4fp37doFLy8vGBoawtzcHL169cKRI0fU5jl06BB8fHxgbGwMExMTdOnSRSO28PBw1TG1srLChAkTkJKSojbP5MmT1WI2NzdH7969tap+TUlJQUBAAIyMjGBtbY05c+ZALpdrvf3lY6noM1tcXIwFCxbA09MTpqamkEgk8Pb2xsmTJ9XKUh6XFStWYPXq1XB2doZIJMKdO3cAANHR0ejSpQvEYjGcnZ2xefPmCrdNJpPhm2++US3fokULfPXVVxqfo8rOqxYtWmDy5Mmq9yUlJVi0aBFcXV0hFothaWmJnj174ujRo1Xu4/LNuIaGhmjXrh1++eWXKpcru2xSUpJq2u3bt2Fubo4hQ4aodbpLSEjAqFGjYGFhAUNDQ7z99ts4ePCgWnnKa1ZFn18jIyPV9paPuaKXsi+Bsn9OQkICfH19IZFIYG9vj8WLF6P8Q6nz8/Mxe/ZsNG/eHCKRCK1atcKKFSs05qsqhrLnt3KeK1euVLkfK+tDFBERAR6Pp1G7UN3zr0WLFgAAZ2dndO3aFS9evICBgYHGMasspuqcv5VdZ5WUx1S5Dffv38fLly9hbGwMHx8fGBoawtTUFEOGDMGtW7c0lr9+/Tr8/PxgYmICIyMj9OvXD//++6/aPMr9fObMGUybNg2WlpYwMTHBxIkTK/xeKHveAMDUqVMhFos19vOhQ4fg7e0NiUQCY2Nj+Pv74/bt21Xut/JqVFNRnjIBsL
S0BFB6Mu3fvx+jRo2Ck5MT0tLSsHnzZvj4+ODOnTuwt7cHAMjlcgwZMgTHjx/HmDFj8MknnyA3NxdHjx7FrVu34OzsrFrH2LFjMXjwYLX1zps3r8J4vv32W/B4PHzxxRd4/vw5Vq9ejf79+yMmJgYGBgYAgBMnTsDPzw+enp5YuHAh+Hw+tm3bhr59++Ls2bPw8vLSKPeNN95AaGgoACAvLw8fffRRheueP38+Ro8ejaCgIKSnp2Pt2rXo1asXrl+/DjMzM41lpk6dCm9vbwDAH3/8gT///FPt79OmTcP27dsRGBiImTNnIjExEevWrcP169dx7tw5CIXCCveDNrKyslTbVpZCocDQoUMRHR2NqVOnonXr1rh58yZWrVqFBw8eYP/+/VqtZ9u2bfi///s//PDDDxg3blyF87xqf6xZswZDhw7F+PHjUVxcjN9++w2jRo3CgQMH4O/vr5pv0aJFCAkJQffu3bF48WLo6+vj4sWLOHHiBAYOHAig9OT84IMP0LZtW8ybNw9mZma4fv06oqKiVPEp932XLl0QGhqKtLQ0rFmzBufOndM4plZWVli1ahWA0k5ja9asweDBg/H48eMKj31Zcrkcvr6+6Nq1K1asWIFjx47hhx9+gLOzs9pnrTrbP23aNPTv31+t/KioKOzevRs2NjYAgJycHPzyyy8YO3YspkyZgtzcXGzZsgW+vr64dOkS3nrrLY1jJ5VKMXXqVIhEIlhYWODmzZsYOHAgrK2tERISAplMhoULF6JZs2Ya2xcUFISwsDCMHDkSs2fPxsWLFxEaGoq7d+9qHOPqCAkJQWhoKIKCguDl5YWcnBxcuXIF165dw4ABA165/KpVq2BlZYWcnBxs3boVU6ZMQYsWLTT2W1UeP36MQYMGwd3dHXv37oWeXuklNS0tDd27d0dBQQFmzpwJS0tLhIWFYejQoYiIiMCwYcO02tZevXph586dqvfffvstAODrr79WTevevbvq/3K5HIMGDcLbb7+NZcuWISoqCgsXLoRMJsPixYsBAIwxDB06FCdPnsR//vMfvPXWWzh8+DA+//xzpKSkqD7H5Sn3W9k46pI25195CxYsgFQqrfa6anP+ViYzMxNA6feVq6srFi1aBKlUivXr16NHjx64fPky3NzcAJQmqN7e3jAxMcHcuXMhFAqxefNm9O7dG6dPn0bXrl3Vyg4ODoaZmRlCQkJw//59bNy4EcnJyarEpiILFy7Eli1b8Pvvv6slhDt37sSkSZPg6+uLpUuXoqCgABs3bkTPnj1x/fp1VcL2SkwL27ZtYwDYsWPHWHp6Onv8+DH77bffmKWlJTMwMGBPnjxhjDEmlUqZXC5XWzYxMZGJRCK2ePFi1bStW7cyAGzlypUa61IoFKrlALDly5drzNO2bVvm4+Ojen/y5EkGgDk4OLCcnBzV9L179zIAbM2aNaqyXV1dma+vr2o9jDFWUFDAnJyc2IABAzTW1b17d+bh4aF6n56ezgCwhQsXqqYlJSUxgUDAvv32W7Vlb968yfT09DSmx8XFMQAsLCxMNW3hwoWs7GE5e/YsA8B2796ttmxUVJTGdEdHR+bv768R+8cff8zKH+rysc+dO5fZ2NgwT09PtX26c+dOxufz2dmzZ9WW37RpEwPAzp07p7G+snx8fFTlHTx4kOnp6bHZs2dXOG919gdjpceprOLiYubh4cH69u2rVhafz2fDhg3T+Cwqj3lWVhYzNjZmXbt2ZYWFhRXOU1xczGxsbJiHh4faPAcOHGAA2IIFC1TTJk2axBwdHdXK+emnnxgAdunSpQq3ueyyANTOD8YY69ixI/P09NR6+8uLi4tjpqambMCAAUwmkzHGGJPJZKyoqEhtvpcvX7JmzZqxDz74QDVNeQ6amJiw58+fq80fEBDAxGIxS05OVk27c+cOEwgEasctJiaGAWBBQUFqy8+ZM4cBYCdOnFBNK//ZVHJ0dGSTJk1Sve/QoUOFn/dXUV7HEhMTVd
MePHjAALBly5ZVe9kXL16wNm3asFatWrGMjAy1+WbNmsUAqJ03ubm5zMnJibVo0UL1mVRes8LDwzXWJZFI1La3rLL
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 200x200 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Тестовая выборка: (18168, 6)\n",
|
|||
|
"hazardous\n",
|
|||
|
"False 16400\n",
|
|||
|
"True 1768\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAfQAAADECAYAAABp29OTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5wElEQVR4nO3dd1xT1/sH8E8SQgJhhi0OkKEoThTrQNCqiFjFqrRq6/riqLWtVmtr+1VxtPxad922VVGsrava2jqrVrFW6wBFUZHhqrIUhEAISc7vD5p8CWELXEye9+uVl+Zy7rnPvbk5zx3n3PAYYwyEEEIIeanxuQ6AEEIIIS+OEjohhBBiACihE0IIIQaAEjohhBBiACihE0IIIQaAEjohhBBiACihE0IIIQaAEjohhBBiACihE0IIMRp5eXlIS0uDTCbjOpQ6RwmdkEZg/PjxsLCw4DqMOhMZGQkej8d1GKSB5OfnY9WqVdr3OTk5WLduHXcBlcIYw+bNm/HKK6/A3NwcVlZWcHd3R0xMDNeh1bkaJfRt27aBx+NpX2KxGN7e3pg+fTrS09PrK0ZCCCGNmJmZGf773/9i586dePDgASIjI/HLL79wHRYAYPTo0Zg6dSp8fHywY8cOHD9+HCdOnMDrr7/OdWh1zqQ2My1atAju7u6Qy+WIjY3Fhg0b8NtvvyEhIQHm5uZ1HSMhhJBGTCAQYOHChRg7dizUajWsrKzw66+/ch0Wtm/fjh9//BExMTEYPXo01+HUu1ol9JCQEHTp0gUAEBERATs7O6xYsQIHDx7EqFGj6jRAQkjjo1QqoVarYWpqynUopJGYNWsW3njjDTx48AA+Pj6wsbHhOiQsXboUo0aNMopkDtTRPfS+ffsCAFJTUwEAT58+xezZs9GuXTtYWFjAysoKISEhiI+P15tXLpcjMjIS3t7eEIvFcHFxweuvv47k5GQAQFpams5l/rKvoKAgbV2nT58Gj8fDjz/+iE8//RTOzs6QSCQYMmQIHjx4oLfsCxcuYODAgbC2toa5uTkCAwNx7ty5ctcxKCio3OVHRkbqlY2JiYGfnx/MzMwglUrx5ptvlrv8ytatNLVajVWrVqFt27YQi8VwcnLClClT8OzZM51ybm5uGDx4sN5ypk+frldnebEvXbpUb5sCQFFRERYsWABPT0+IRCI0a9YMc+bMQVFRUbnbqrSgoCC9+j7//HPw+Xx8//33tdoey5YtQ48ePWBnZwczMzP4+flh79695S4/JiYG/v7+MDc3h62tLXr37o1jx47plDl8+DACAwNhaWkJKysrdO3aVS+2PXv2aD9Te3t7vPXWW3j06JFOmfHjx+vEbGtri6CgIJw9e7bK7aTx6NEjhIWFwcLCAg4ODpg9ezZUKlWN179sLOXtswqFAvPnz4efnx+sra0hkUgQEBCAU6dO6dSl+VyWLVuGVatWwcPDAyKRCDdv3gQAxMbGomvXrhCLxfDw8MCmTZvKXTelUonFixdr53dzc8Onn36qtx9V9L1yc3PD+PHjte+Li4uxcOFCeHl5QSwWw87ODr169cLx48cr3cZlbx2am5ujXbt2+Pbbb2s0X3mvbdu2acvfunULI0aMgFQqhVgsRpcuXfDzzz/r1ZuTk4OZM2fCzc0NIpEITZs2xdixY5GVlaVt0yp7ld5WV69eRUhICKysrGBhYYFXX30Vf/31V63X/+TJkwgICIBEIoGNjQ2GDh2KxMREnTKl+0s0bdoU3bt3h4mJCZydncHj8XD69OlKt6tmfs3L0tIS/v7+OHDggE65oKAg+Pr6VliPZj/VfAYymQwJCQlo1qwZQkNDYWVlBYlEUuF3MiUlBSNHjoRUKoW5uTleeeUVvasMNckxNWn7apKLKlOrM/SyNMnXzs4OQMmGOXDgAEaOHAl3d3ekp6dj06ZNCAwMxM2bN9GkSRMAgEqlwuDBg/H777/jzTffxAcffIC8vDwcP34cCQkJ8PDw0C
5j1KhRGDRokM5y586dW248n3/+OXg8Hj7++GNkZGRg1apV6NevH+Li4mBmZgagZEcNCQmBn58fFixYAD6fj61bt6Jv3744e/Ys/P399ept2rQpoqKiAJR0AnnnnXfKXfa8efMQHh6OiIgIZGZmYs2aNejduzeuXr1a7lHr5MmTERAQAADYv38/fvrpJ52/T5kyBdu2bcOECRPw/vvvIzU1FWvXrsXVq1dx7tw5CIXCcrdDTeTk5GjXrTS1Wo0hQ4YgNjYWkydPho+PD65fv46VK1fizp07el+6qmzduhX//e9/sXz58gqPmqvaHqtXr8aQIUMwZswYKBQK/PDDDxg5ciQOHTqE0NBQbbmFCxciMjISPXr0wKJFi2BqaooLFy7g5MmTGDBgAICSxm3ixIlo27Yt5s6dCxsbG1y9ehVHjhzRxqfZ9l27dkVUVBTS09OxevVqnDt3Tu8ztbe3x8qVKwEADx8+xOrVqzFo0CA8ePCgyjMWlUqF4OBgdOvWDcuWLcOJEyewfPlyeHh46Oxr1Vn/KVOmoF+/fjr1HzlyBDt37oSjoyMA4Pnz5/j2228xatQoTJo0CXl5efjuu+8QHByMixcvomPHjnqfnVwux+TJkyESiSCVSnH9+nUMGDAADg4OiIyMhFKpxIIFC+Dk5KS3fhEREYiOjsaIESMwa9YsXLhwAVFRUUhMTNT7jKsjMjISUVFRiIiIgL+/P54/f45Lly7hypUr6N+/f5Xzr1y5Evb29nj+/Dm2bNmCSZMmwc3NTW+7afTu3Rs7duzQvv/8888BAJ999pl2Wo8ePQAAN27cQM+ePeHq6opPPvkEEokEu3fvRlhYGPbt24dhw4YBKGlHAgICkJiYiIkTJ6Jz587IysrCzz//jIcPH2rv+2ps3rwZiYmJ2n0MANq3b69dZkBAAKysrDBnzhwIhUJs2rQJQUFB+OOPP9CtW7carf+JEycQEhKCli1bIjIyEoWFhVizZg169uyJK1euwM3NrcJtu3z58hr3q9KsZ1ZWFtavX4+RI0ciISEBrVq1qlE9GtnZ2QCAL7/8Es7Ozvjoo48gFovxzTffoF+/fjh+/Dh69+4NAEhPT0ePHj1QUFCA999/H3Z2doiOjsaQIUOwd+9e7eelUZ0cU1ZFbV9tclGFWA1s3bqVAWAnTpxgmZmZ7MGDB+yHH35gdnZ2zMzMjD18+JAxxphcLmcqlUpn3tTUVCYSidiiRYu007Zs2cIAsBUrVugtS61Wa+cDwJYuXapXpm3btiwwMFD7/tSpUwwAc3V1Zc+fP9dO3717NwPAVq9era3by8uLBQcHa5fDGGMFBQXM3d2d9e/fX29ZPXr0YL6+vtr3mZmZDABbsGCBdlpaWhoTCATs888/15n3+vXrzMTERG96UlISA8Cio6O10xYsWMBKfyxnz55lANjOnTt15j1y5Ije9BYtWrDQ0FC92N99911W9qMuG/ucOXOYo6Mj8/Pz09mmO3bsYHw+n509e1Zn/o0bNzIA7Ny5c3rLKy0wMFBb36+//spMTEzYrFmzyi1bne3BWMnnVJpCoWC+vr6sb9++OnXx+Xw2bNgwvX1R85nn5OQwS0tL1q1bN1ZYWFhuGYVCwRwdHZmvr69OmUOHDjEAbP78+dpp48aNYy1atNCpZ/PmzQwAu3jxYrnrXHpeADrfD8YY69SpE/Pz86vx+peVlJTErK2tWf/+/ZlSqWSMMaZUKllRUZFOuWfPnjEnJyc2ceJE7TTNd9DKyoplZGTolA8LC2NisZjdu3dPO+3mzZtMIBDofG5xcXEMAIuIiNCZf/bs2QwAO3nypHZa2X1To0WLFmzcuHHa9x06dCh3f6+Kph1LTU3VTrtz5w4DwL766qtq11N63y7r1VdfZe3atWNyuVw7Ta1Wsx49ejAvLy/ttPnz5zMAbP/+/Xp1lG6bNMrbxzTCwsKYqakpS05O1k77559/mKWlJevdu7d2WnXXv2PHjszR0ZFlZ2drp8
XHxzM+n8/Gjh2rnVb2O5qRkcEsLS1ZSEgIA8BOnTpVbrwVzc8YY8eOHWMA2O7du7XTAgMDWdu2bSusR7Ofbt26Vee
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 200x200 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Вывод распределения количества наблюдений по меткам (классам)\n",
|
|||
|
"print(df2.hazardous.value_counts())\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"data = df2[['est_diameter_min', 'est_diameter_max', 'relative_velocity', 'miss_distance', 'absolute_magnitude', 'hazardous']].copy()\n",
|
|||
|
"\n",
|
|||
|
"df_train, df_val, df_test = split_stratified_into_train_val_test(\n",
|
|||
|
" data, stratify_colname=\"hazardous\", frac_train=0.60, frac_val=0.20, frac_test=0.20\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Обучающая выборка: \", df_train.shape)\n",
|
|||
|
"print(df_train.hazardous.value_counts())\n",
|
|||
|
"hazardous_counts = df_train['hazardous'].value_counts()\n",
|
|||
|
"plt.figure(figsize=(2, 2))# Установка размера графика\n",
|
|||
|
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)# Построение круговой диаграммы\n",
|
|||
|
"plt.title('Распределение классов hazardous в обучающей выборке')# Добавление заголовка\n",
|
|||
|
"plt.show()# Отображение графика\n",
|
|||
|
"\n",
|
|||
|
"print(\"Контрольная выборка: \", df_val.shape)\n",
|
|||
|
"print(df_val.hazardous.value_counts())\n",
|
|||
|
"hazardous_counts = df_val['hazardous'].value_counts()\n",
|
|||
|
"plt.figure(figsize=(2, 2))\n",
|
|||
|
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)\n",
|
|||
|
"plt.title('Распределение классов hazardous в контрольной выборке')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"print(\"Тестовая выборка: \", df_test.shape)\n",
|
|||
|
"print(df_test.hazardous.value_counts())\n",
|
|||
|
"hazardous_counts = df_test['hazardous'].value_counts()\n",
|
|||
|
"plt.figure(figsize=(2, 2))\n",
|
|||
|
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)\n",
|
|||
|
"plt.title('Распределение классов hazardous в тестовой выборке')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Соотношение классов сильно смещено: это может привести к проблемам при обучении модели, так как она будет обучаться в основном на одном классе. Стоит рассмотреть методы аугментации данных (oversampling и undersampling)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 54,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Обучающая выборка после oversampling: (100631, 6)\n",
|
|||
|
"hazardous\n",
|
|||
|
"True 51434\n",
|
|||
|
"False 49197\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqwAAADECAYAAABa+nMuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABAVElEQVR4nO3dd1xTV/8H8E8SQsIUkCGOIoKK4qhSbYsiblyPddVZ6/i5am2rttWqVXGVx9rWPTrVqtW6bbXO2qHWuupeRdSqqCigbAhJzu8PnqSEhClwA3zerxcvzc29537vzbk335x77rkyIYQAEREREZGVkksdABERERFRXpiwEhEREZFVY8JKRERERFaNCSsRERERWTUmrERERERk1ZiwEhEREZFVY8JKRERERFaNCSsRERERWTUmrERERGWcXq9HbGwsbt68KXUoRCWCCSuRFRg6dCgcHR2lDqPYhIeHQyaTSR0G0TM7evQofv31V+PrX3/9FceOHZMuoGwePnyI8ePHw8fHB7a2tvDw8ED9+vWRmJgodWhExa5QCeuaNWsgk8mMf2q1GnXq1MG4ceMQExNTUjESERFJ4u7duxg7diwuXryIixcvYuzYsbh7967UYeHGjRto1qwZNm3ahNGjR2P37t04ePAgfv75Zzg4OEgdHhXCTz/9BJlMhqpVq0Kv11ucp2bNmsbcSy6Xw8XFBQ0bNsSoUaNw4sSJZy5fo9Fg8eLFaNKkCZydneHi4oLAwECMGjUK165dAwB07twZrq6uFvO9hIQEeHt748UXX4Rer8evv/5qjPfMmTNm8xelkcamUHP/z+zZs+Hr64v09HQcPXoUK1euxE8//YRLly7B3t6+KEUSERFZnV69emHRokVo1KgRAODll19Gr169JI4KGD16NGxtbfHnn3+iWrVqUodDz2DDhg2oWbMmbt++jcOHD6N9+/YW53v++efx7rvvAgCSkpJw9epVbNmyBV9++SUmTJiAzz77rMjl9+7dG3v37sWAAQMwcuRIZGZm4tq1a9i9ezeCg4MREBCAFStWoEGDBpgwYQK+++47k+WnTp2K2NhY7Nu3D3K5aVtoeHg4fvzxx6LsGlOiEFavXi0AiFOnTplMnzhxogAgvvvuu8IUR0T/M2TIEOHg4CB1GAWWmZkpMjIycn1/5syZopCnFyKrpdVqxblz58S5c+eEVquVOhxx+vRpAUAcOHBA6lDoGSUnJwsHBwexZMkS0aRJEzF06FCL8/n4+IiuXbuaTU9NTRU9evQQAMSKFSuKVP7JkycFADFv3jyz97RarYiNjTW+nj9/vgAg9u/fb7K8XC4XkyZNMk775ZdfBADx/PPPCwDizJkzJuUW5TuvWPqwtm3bFgBw69YtAEB8fDzee+89NGzYEI6OjnB2dkbnzp1x/vx5s2XT09MRHh6OOnXqQK1Ww9vbG7169UJUVBQA4Pbt2ybdEHL+tW7d2liWoQn6+++/x9SpU1GlShU4ODige/fuFi/hnDhxAp06dUKlSpVgb2+P0NDQXPsmtW7d2uL6w8PDzeZdv349goKCYGdnBzc3N/Tv39/i+vPatuz0ej0WLVqEwMBAqNVqeHl5YfTo0Xjy5InJfDVr1kS3bt3M1jNu3DizMi3FvmDBArN9CgAZGRmYOXMm/P39oVKpUKNGDUyaNAkZGRkW91V2rVu3Nitv3rx5kMvlZr/QCro/PvnkEwQHB6Ny5cqws7NDUFAQtm7danH969evR/PmzWFvbw9XV1e0atUKBw4cMJln7969CA0NhZOTE5ydndGsWTOz2LZs2WL8TN3d3fHaa68hOjraZJ6hQ4eaxOzq6orWrVvjyJEj+e4ng+joaPTo0QOOjo7w8PDAe++9B51OV+jtzxmLpTqr0WgwY8YMBAUFoVKlSnBwcEBISAh++eUXk7IMn8snn3yCRYsWwc/PDyqVCleuXAGQ1cevWbNmUKvV8PPzw+eff25x27RaLebMmW
NcvmbNmpg6dapZPcrtuKpZsyaGDh1qfJ2ZmYlZs2ahdu3aUKvVqFy5Mlq2bImDBw/muY9zdm2yt7dHw4YN8dVXXxVqOUt/a9asAfDv5a6bN28iLCwMDg4OqFq1KmbPng0hhEm5Uh7fhT1nFudxsGvXLnTt2hVVq1aFSqWCn58f5syZY1bfLW2L4bO4fft2kfZPQeuioc4pFAo0btwYjRs3xvbt2yGTyVCzZk2zdeWU8zJulSpV0K9fP9y5c8c4T/bjKzc5+4T/+eefUKvViIqKQmBgIFQqFapUqYLRo0cjPj7ebPmCfm4FqbOGeA11Hchq7QsKCoKvry8ePHhgnF7Qum1JXucwmUxm0q+4oNsIANeuXUPfvn3h4eEBOzs71K1bF9OmTTObL/tnl9d69+7di5CQEDg4OMDJyQldu3bF5cuX890+gx07diAtLQ2vvvoq+vfvj+3btyM9Pb3Ay9vZ2WHdunVwc3PDvHnzzM4vBSnfkG+1aNHCrHyFQoHKlSsbX0+cOBGNGjXC2LFjkZ6eDp1OhzFjxsDHxwczZ840W/6tt96Cq6urxXN6YRWpS0BOho01bNTNmzexc+dOvPrqq/D19UVMTAw+//xzhIaG4sqVK6hatSoAQKfToVu3bvj555/Rv39/vPPOO0hKSsLBgwdx6dIl+Pn5GdcxYMAAdOnSxWS9U6ZMsRjPvHnzIJPJMHnyZDx69AiLFi1C+/btce7cOdjZ2QEADh8+jM6dOyMoKAgzZ86EXC7H6tWr0bZtWxw5cgTNmzc3K7d69eqIiIgAACQnJ+ONN96wuO7p06ejb9++GDFiBB4/foylS5eiVatWOHv2LFxcXMyWGTVqFEJCQgAA27dvx44dO0zeHz16NNasWYNhw4bh7bffxq1bt7Bs2TKcPXsWx44dg1KptLgfCuPp06fGbctOr9eje/fuOHr0KEaNGoV69erh4sWLWLhwIf7++2/s3LmzUOtZvXo1PvzwQ3z66acYOHCgxXny2x+LFy9G9+7dMWjQIGg0GmzatAmvvvoqdu/eja5duxrnmzVrFsLDwxEcHIzZs2fD1tYWJ06cwOHDh9GxY0cAWV98w4cPR2BgIKZMmQIXFxecPXsW+/btM8Zn2PfNmjVDREQEYmJisHjxYhw7dszsM3V3d8fChQsBAPfu3cPixYvRpUsX3L171+Jnn51Op0NYWBhefPFFfPLJJzh06BA+/fRT+Pn5mdS1gmz/6NGjzS777Nu3Dxs2bICnpycAIDExEV999ZXxElBSUhK+/vprhIWF4eTJk3j++efNPrv09HSMGjUKKpUKbm5uuHjxIjp27AgPDw+Eh4dDq9Vi5syZ8PLyMtu+ESNGYO3atejTpw/effddnDhxAhEREbh69arZZ1wQ4eHhiIiIwIgRI9C8eXMkJibi9OnT+Ouvv9ChQ4d8l1+4cCHc3d2RmJiIb775BiNHjkTNmjVzvRzXqlUrrFu3zvh63rx5AGDyZRccHGz8v06nQ6dOnfDSSy/h448/xr59+zBz5kxotVrMnj3bOJ+Ux3f2bcnvnFncx8GaNWvg6OiIiRMnwtHREYcPH8aMGTOQmJiIBQsWPPM256WodVGr1VpMbvISEhKCUaNGQa/X49KlS1i0aBHu379fqB+yOcXFxSE9PR1vvPEG2rZtizFjxiAqKgrLly/HiRMncOLECahUKgCF+9wKWmezy8zMRO/evXHnzh0cO3YM3t7exveetW6rVCqzH5KnTp3CkiVLTKYVdBsvXLiAkJAQKJVKjBo1CjVr1kRUVBR+/PFH4/GcneGzA4CrV6/io48+Mnl/3bp1GDJkCMLCwjB//nykpqZi5cqVaNmyJc6ePVugHzUbNmxAmzZtUKVKFfTv3x8ffPABfvzxR7z66qv5Lmvg6OiInj174uuvv8aVK1cQGBhYqPJ9fHyM87Zo0QI2NrmnhjY2Nvjiiy8QHByMOXPmwNPTE3/99Rf27dtnsUuos7MzJkyYgBkzZu
Cvv/5C06ZNC7xdZgrTHGvoEnDo0CHx+PFjcffuXbFp0yZRuXJlYWdnJ+7duyeEECI9PV3odDqTZW/duiVUKpWYPXu
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 200x200 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import ADASYN\n",
|
|||
|
"\n",
|
|||
|
"# Создание экземпляра ADASYN\n",
|
|||
|
"ada = ADASYN()\n",
|
|||
|
"\n",
|
|||
|
"# Применение ADASYN\n",
|
|||
|
"X_resampled, y_resampled = ada.fit_resample(df_train.drop(columns=['hazardous']), df_train['hazardous'])\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame\n",
|
|||
|
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
|
|||
|
"df_train_adasyn['hazardous'] = y_resampled # Добавление целевой переменной\n",
|
|||
|
"\n",
|
|||
|
"# Вывод информации о новой выборке\n",
|
|||
|
"print(\"Обучающая выборка после oversampling: \", df_train_adasyn.shape)\n",
|
|||
|
"print(df_train_adasyn['hazardous'].value_counts())\n",
|
|||
|
"hazardous_counts = df_train_adasyn['hazardous'].value_counts()\n",
|
|||
|
"plt.figure(figsize=(2, 2))\n",
|
|||
|
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)\n",
|
|||
|
"plt.title('Распределение классов hazardous в тренировачной выборке после ADASYN')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 55,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Обучающая выборка после undersampling: (10608, 6)\n",
|
|||
|
"hazardous\n",
|
|||
|
"False 5304\n",
|
|||
|
"True 5304\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAuYAAADECAYAAADTYuRHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABAH0lEQVR4nO3dd1gUV9sG8Ht3gaWrFClqFEFBxIolsWFHxc8aNUZjiy3G5NXEFDXGEpXXmNhbEmM3mthfNdZYYo8aCzZE7I0iUgQW2N3z/UF2w7JLdXEQ7t917aU7O3PmmTNnZp6dPXOQCSEEiIiIiIhIUnKpAyAiIiIiIibmRERERETFAhNzIiIiIqJigIk5EREREVExwMSciIiIiKgYYGJORERERFQMMDEnIiIiIioGmJgTERERERUDTMyJiIhec1qtFrGxsbh9+7bUoRDRS2BiTlQMDBo0CPb29lKHYTZTpkyBTCaTOgyil3b8+HEcOXJE//7IkSM4ceKEdAFl8fTpU4wZMwaVK1eGlZUVXF1d4e/vj8TERKlDo2Jo0KBBqFKlitRhFAtHjhyBTCYzOLaLS/0UKDFftWoVZDKZ/mVtbY3q1atj9OjRiIqKKqoYiYiIJPHgwQOMGjUKYWFhCAsLw6hRo/DgwQOpw8KtW7fQsGFDbNy4ESNGjMCuXbtw4MAB/PHHH7Czs5M6PMqDLp86d+6cyc87d+5cLJJEevUsCrPQtGnT4OXlBZVKhePHj2Pp0qX4/fffceXKFdja2po7RiIiIkn06NED8+bNQ+3atQEAb731Fnr06CFxVMCIESNgZWWF06dPo0KFClKHQ/Ta++mnn6DVaqUOo3CJeceOHdGgQQMAwNChQ+Hs7Iw5c+Zgx44d6Nu3r1kDJKLiR61WQ6vVwsrKSupQiIqUUqnEyZMnceXKFQBAQEAAFAqFpDGdP38ehw4dwv79+5mUkySEEFCpVLCxsZE6FLOxtLSUOgQAZupj3rp1awDAnTt3AABxcXEYN24catWqBXt7ezg6OqJjx464dOmS0bIqlQpTpkxB9erVYW1tDQ8PD/To0QORkZEAgLt37xp0n8n+atmypb4sXZ+hX3/9FRMmTIC7uzvs7OzQpUsXkz89njlzBh06dECZMmVga2uLoKCgHPsOtmzZ0uT6p0yZYjTvunXrEBgYCBsbGzg5OeGdd94xuf7cti0rrVaLefPmoWbNmrC2toabmxtGjBiB58+fG8xXpUoVdO7c2Wg9o0ePNirTVOyzZ882qlMASEtLw+TJk+Hj4wOlUolKlSrh888/R1pamsm6yqply5ZG5c2YMQNyuRy//PJLoerju+++Q5MmTeDs7AwbGxsEBgZi8+bNJte/bt06NGrUCLa2tihXrhxatGiB/fv3G8yzZ88eBAUFwcHBAY6OjmjYsKFRbJs2bdLvUxcXF/Tv3x+PHj0ymGfQoEEGMZcrVw4tW7bEsWPH8qwnnUePHqFbt26wt7eHq6srxo0bB41GU+Dtzx6LqTabnp6Or7/+GoGBgShTpgzs7OzQvHlzHD582KAs3X757rvvMG/ePHh7e0OpVOLatWsAMvvgNmzYENbW1vD29sYPP/xgctvUajW++eYb/fJVqlTBhAkTjNpRTsdVlSpVMGjQIP37jIwMTJ06FdWqVYO1tTWcnZ3RrFkzHDhwINc6zt4lz9bWFrVq1cLy5csLtJyp16pVqwD8+8zA7du3ERwcDDs7O3h6emLatGkQQhiUK+XxXdBzprmPgyVLlqBmzZpQKpXw9PTEhx9+iPj4+Dy3Rbcv7t69W6j6yW9b1LU5hUKBOnXqoE6dOti6dStkMlm+uhlUqVJFXw9yuRzu7u7o06cP7t+/r58n6/GVk+zPbJw+fRrW1taIjIzU15+7uztGjBiBuLg4o+Xzu9/y02Z18eraOgAkJSUhMDAQXl5eePLkiX56ftu2Kbmdw7L3Dc7vNgLAjRs30Lt3b7i6usLGxga+vr
6YOHGi0XxZ911u692zZw+aN28OOzs7ODg4ICQkBFevXs1z+woqazv58ccf9W23YcOGOHv2rNH827dvR0BAAKytrREQEIBt27aZLLeg5599+/ahQYMGsLGx0Z/rDxw4gGbNmqFs2bKwt7eHr68vJkyYoF+2MNeaxYsXo2rVqrC1tUX79u3x4MEDCCHwzTffoGLFirCxsUHXrl2N2rsuzv3796Nu3bqwtraGv78/tm7dmmcdZ+9jXtA637RpE/z9/Q3qvDD91gt1xzw7XRLt7OwMALh9+za2b9+OXr16wcvLC1FRUfjhhx8QFBSEa9euwdPTEwCg0WjQuXNn/PHHH3jnnXfwn//8B0lJSThw4ACuXLkCb29v/Tr69u2LTp06Gax3/PjxJuOZMWMGZDIZvvjiC0RHR2PevHlo27YtLl68qP92d+jQIXTs2BGBgYGYPHky5HI5Vq5cidatW+PYsWNo1KiRUbkVK1ZEaGgoAODFixf44IMPTK570qRJ6N27N4YOHYqYmBgsXLgQLVq0wIULF1C2bFmjZYYPH47mzZsDALZu3Wp0AI0YMQKrVq3C4MGD8fHHH+POnTtYtGgRLly4gBMnTpjlW158fLx+27LSarXo0qULjh8/juHDh6NGjRoICwvD3LlzcfPmTWzfvr1A61m5ciW++uorfP/993j33XdNzpNXfcyfPx9dunRBv379kJ6ejo0bN6JXr17YtWsXQkJC9PNNnToVU6ZMQZMmTTBt2jRYWVnhzJkzOHToENq3bw8g8wI/ZMgQ1KxZE+PHj0fZsmVx4cIF7N27Vx+fru4bNmyI0NBQREVFYf78+Thx4oTRPnVxccHcuXMBAA8fPsT8+fPRqVMnPHjwwOS+z0qj0SA4OBiNGzfGd999h4MHD+L777+Ht7e3QVvLz/aPGDECbdu2NSh/7969WL9+PcqXLw8ASExMxPLly9G3b18MGzYMSUlJ+PnnnxEcHIy//voLdevWNdp3KpUKw4cPh1KphJOTE8LCwtC+fXu4urpiypQpUKvVmDx5Mtzc3Iy2b+jQoVi9ejXefvttfPrppzhz5gxCQ0Nx/fr1HC8auZkyZQpCQ0MxdOhQNGrUCImJiTh37hz+/vtvtGvXLs/l586dCxcXFyQmJmLFihUYNmwYqlSpYlRvOi1atMDatWv172fMmAEABhf1Jk2a6P+v0WjQoUMHvPnmm/j222+xd+9eTJ48GWq1GtOmTdPPJ+XxnXVb8jpnmvs4mDJlCqZOnYq2bdvigw8+QHh4OJYuXYqzZ8+abbtzUti2qFarTSZxuWnevDmGDx8OrVaLK1euYN68eXj8+HGBvrBn9+zZM6hUKnzwwQdo3bo1Ro4cicjISCxevBhnzpzBmTNnoFQqARRsv+W3zWaVkZGBnj174v79+zhx4gQ8PDz0n71s21YqlUZfmM+ePYsFCxYYTMvvNl6+fBnNmzeHpaUlhg8fjipVqiAyMhI7d+7UH89Z6fYdAFy/fh0zZ840+Hzt2rUYOHAggoODMWvWLKSkpGDp0qVo1qwZLly4UCR9xH/55RckJSVhxIgRkMlk+Pbbb9GjRw/cvn1bX5/79+9Hz5494e/vj9DQUDx79gyDBw9GxYoVjcoryD4KDw9H3759MWLECAwbNgy+vr64evUqOnfujNq1a2PatGlQKpW4deuWwU3Ogl5r1q9fj/T0dHz00UeIi4vDt99+i969e6N169Y4cuQIvvjiC9y6dQsLFy7EuHHjsGLFCoPlIyIi0KdPH4wcORIDBw7EypUr0atXL+zduzdf14bC1Pnu3bvRp08f1KpVC6GhoXj+/Dnef//9wv2iJQpg5cqVAoA4ePCgiImJEQ8ePBAbN24Uzs7OwsbGRjx8+FAIIYRKpRIajcZg2Tt37gilUimmTZumn7ZixQoBQMyZM8doXVqtVr8cADF79myjeWrWrCmCgoL07w8fPiwAiAoVKojExET99N9++00AEPPnz9eXXa1aNREcHK
xfjxBCpKSkCC8vL9GuXTujdTVp0kQEBATo38fExAgAYvLkyfppd+/eFQqFQsyYMcNg2bCwMGFhYWE0PSIiQgAQq1e
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 200x200 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"#андерсемплинг\n",
|
|||
|
"rus = RandomUnderSampler()# Создание экземпляра RandomUnderSampler\n",
|
|||
|
"\n",
|
|||
|
"# Применение RandomUnderSampler\n",
|
|||
|
"X_resampled, y_resampled = rus.fit_resample(df_train.drop(columns=['hazardous']), df_train['hazardous'])\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame\n",
|
|||
|
"df_train_undersampled = pd.DataFrame(X_resampled)\n",
|
|||
|
"df_train_undersampled['hazardous'] = y_resampled # Добавление целевой переменной\n",
|
|||
|
"\n",
|
|||
|
"# Вывод информации о новой выборке\n",
|
|||
|
"print(\"Обучающая выборка после undersampling: \", df_train_undersampled.shape)\n",
|
|||
|
"print(df_train_undersampled['hazardous'].value_counts())\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация распределения классов\n",
|
|||
|
"hazardous_counts = df_train_undersampled['hazardous'].value_counts()\n",
|
|||
|
"plt.figure(figsize=(2, 2))\n",
|
|||
|
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)\n",
|
|||
|
"plt.title('Распределение классов hazardous в тренировочной выборке после Undersampling')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Заработная плата рабочих мест в области Data Science\thttps://www.kaggle.com/datasets/henryshan/2023-data-scientists-salary"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Целью этого исследования является изучение факторов, влияющих на заработную плату специалистов по обработке данных. Для этого был использован набор данных, содержащий различные релевантные переменные. В этом отчёте описывается исследовательский анализ, проведённый для понимания взаимосвязи между этими факторами и заработной платой специалистов по обработке данных."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 62,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['work_year', 'experience_level', 'employment_type', 'job_title',\n",
|
|||
|
" 'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',\n",
|
|||
|
" 'remote_ratio', 'company_location', 'company_size'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df3 = pd.read_csv(\"..//static//csv//ds_salaries.csv\")\n",
|
|||
|
"\n",
|
|||
|
"print(df3.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 63,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA7kklEQVR4nO3dd5hV1b0//s8MMAMIMwOIFKWIYgFB7EGsERsYSxRLiIKxxKhRE0tiQEFjvhpbEonGGK+i6JWrXlssESIiSQRUFBGwBiyRJjoUpTP794e/OZdDGQbUNYO8Xs8zD5y911nrs/dZz5nznrNLQZZlWQAAAHzDCmu6AAAAYPMgfAAAAEkIHwAAQBLCBwAAkITwAQAAJCF8AAAASQgfAABAEsIHAACQhPABAAAkIXwAfEXvv/9+FBQUxNChQ2u6lE3e0KFDo6CgIN5///2aLmWtBg8eHAUFBTVdxlfSvn376N+/f02XAWymhA+g1qj84Lnqz1ZbbRUHH3xwPPPMM8nrGT16dF4t9erViw4dOsRpp50W06ZN+1rGePHFF2Pw4MExb968r6U/AKjNhA+g1rn66qtj2LBhce+998Zll10Wn3zySfTq1SuefPLJGqnnggsuiGHDhsUdd9wRvXv3jv/5n/+JvfbaK2bMmPGV+37xxRfjqquuEj42EQMHDozFixfXdBkAm6y6NV0AwOqOPPLI2HPPPXOPzzjjjGjRokU88MADcdRRRyWvZ//9948TTjghIiJOP/302GGHHeKCCy6Ie+65Jy6//PLk9VB9ixYtioYNG35t/dWtWzfq1vWrE2Bj+eYDqPXKysqiQYMGa3zo++KLL+Liiy+ONm3aRHFxcey4445x4403RpZlERGxePHi2GmnnWKnnXbK+2v1Z599Fq1atYp99903Vq5cucH1fPe7342IiOnTp1fZbtSoUbH//vvHFltsEWVlZXHMMcfEm2++mVs/ePDguPTSSyMiYtttt80d3lWd8x1WPzyt8mf06NFrtO3fv/9a2w4ePDiv3cMPPxx77rlnNG7cOK/djTfeWGUty5cvj6uuuio6duwY9evXj2bNmsV+++0XI0eOzLWZNGlS9O/fPzp06BD169ePli1bxo9+9KP49NNP17utjz/+ePTu3Ttat24dxcXFsd1228Wvf/3rNV67gw46KHbZZZeYMGFCHHDAAdGwYcP41a9+Ff369Ystt9wyli9fvkbfhx12WOy4447rraHS2s75KCgoiPPPPz8ee+yx2GWXXaK4uDg6d+4cf/vb36rdb8T/Hea3+mu4tnOKZs2aFaeffnpss802UVxcHK1atYpjjjkmb+5kWRbXXHNNbLPNNtGwYcM4+OCDY8qUKRtUE8DXzZ9vgFpn/vz5MXfu3MiyLObMmRNDhgyJzz//PH74wx/m2mRZFkcffXQ8//zzccYZZ0S3bt3i2WefjUsvvTQ+/vjj+N3vfhcNGjSIe+65J3r06BEDBgyIm2++OSIizjvvvJg/f34MHTo06tSps8H1/fvf/46IiGbNmq2zzd///vc48sgjo0OHDjF48OBYvHhxDBkyJHr06BGvvvpqtG/fPr7//e/HO++8Ew888ED87ne/iy233DIiIpo3b16tOg499NA47bTTIiLi5ZdfjltuuWWdbbfccsv43e9+l3t86qmn5q0fO3ZsnHjiibHrrrvGddddF6WlpTF37tz42c9+tt46Bg8eHNdee22ceeaZsffee8eCBQvilVdeiVdffTUOPfTQiIgYOXJkTJs2LU4//fRo2bJlTJkyJe64446YMmVKjBs3rsqTuIcOHRqNGjWKn//859GoUaMYNWpUXHnllbFgwYK44YYb8tp++umnceSRR8bJJ58cP/zhD6NFixaxxRZbxL333hvPPvts3jdns2bNilGjRsWgQYPWu43r889//jMeeeSROPfcc6Nx48Zxyy23xPHHHx8ffvhhlfNkYx1//PExZcqU+OlPfx
rt27ePOXPmxMiRI+PDDz+M9u3bR0TElVdeGddcc0306tUrevXqFa+++mocdthhsWzZsq+9HoBqywBqibvvvjuLiDV+iouLs6FDh+a1feyxx7KIyK655pq85SeccEJWUFCQvffee7lll19+eVZYWJiNGTMme+ihh7KIyH7/+9+vt57nn38+i4jsrrvuyj755JNsxowZ2VNPPZW1b98+KygoyF5++eUsy7Js+vTpWURkd999d+653bp1y7baaqvs008/zS17/fXXs8LCwuy0007LLbvhhhuyiMimT59e7f20bNmyLCKy888/P7escruef/75Ndr37ds323bbbfOWRUQ2aNCg3OPLL788i4hs5syZuWWV23XDDTdUWc+uu+6a9e7du8o2ixYtWmPZAw88kEVENmbMmNyyyjmw6v5Y23N//OMfZw0bNsyWLFmSW3bggQdmEZHdfvvteW1XrlyZbbPNNtlJJ52Ut/zmm2/OCgoKsmnTplVZ+6oGDRqUrf6rMyKyoqKivDn3+uuvZxGRDRkypNp9V8631V/D1edXeXn5el+XOXPmZEVFRVnv3r2zioqK3PJf/epXWURk/fr1q3ZdAF8nh10Btc6tt94aI0eOjJEjR8Z9990XBx98cJx55pnxyCOP5No8/fTTUadOnbjgggvynnvxxRdHlmV5V8caPHhwdO7cOfr16xfnnntuHHjggWs8ryo/+tGPonnz5tG6devo3bt3fPHFF3HPPffknZeyqpkzZ8bEiROjf//+0bRp09zyrl27xqGHHhpPP/10tcdemyVLlkRERP369avVftmyZVFcXFxlm4ULF0ZhYWGUlZVtcD1lZWUxZcqUePfdd9fZpkGDBrn/L1myJObOnRvf+c53IiLi1VdfrbL/VZ+7cOHCmDt3buy///6xaNGieOutt/LaFhcXx+mnn563rLCwMPr27RtPPPFELFy4MLf8/vvvj3333Te23Xbb9W/kevTs2TO222673OOuXbtGSUnJ13ZVtFU1aNAgioqKYvTo0VFeXr7WNn//+99j2bJl8dOf/jTvW6WLLrroa68HYEMIH0Cts/fee0fPnj2jZ8+e0bdv33jqqaeiU6dOcf755+cOGfnggw+idevW0bhx47zn7rzzzrn1lYqKiuKuu+6K6dOnx8KFC+Puu+/eoHs1XHnllTFy5MgYNWpUTJo0KWbMmLHGYUurqhx7becS7LzzzjF37tz44osvqj3+6ubOnRsREaWlpdVqP2/evGjUqFGVbbp37x4VFRVx4YUXxr///e+YO3fuOj/Yru7qq6+OefPmxQ477BBdunSJSy+9NCZNmpTX5rPPPosLL7wwWrRoEQ0aNIjmzZvnPvTPnz+/yv6nTJkSxx13XJSWlkZJSUk0b948dwje6s/deuuto6ioaI0+TjvttFi8eHE8+uijERHx9ttvx4QJE6p8HTdE27Zt11jWpEmTau/DDVFcXBy//e1v45lnnokWLVrEAQccENdff33MmjUr16ZyDnbs2DHvuc2bN48mTZp87TUBVJfwAdR6hYWFcfDBB8fMmTOr/Ot6VZ599tmI+PKv7hvaR5cuXaJnz55x8MEHR5cuXWr8akeVJxVXHtu/PrNmzYqWLVtW2ebkk0+Oiy++OIYOHRrbb799NG/ePHbfffdq9X/AAQfEv//977jrrrtil112iTvvvDN23333uPPOO3NtTjzxxPjLX/4S55xzTjzyyCMxYsSI3AnZFRUV6+x73rx5ceCBB8brr78eV199dfz1r3+NkSNHxm9/+9u1PnfVb0lW1alTp9hjjz3ivvvui4iI++67L4qKiuLEE0+s1jauz7rOHcr+/4sfVMe6AvHaLopw0UUXxTvvvBPXXntt1K9fP6644orYeeed47XXXqv2eAA1QfgANgkrVqyIiIjPP/88IiLatWsXM2bMyDuMJiJyh+G0a9cut2zSpElx9dVXx+mnnx677bZbnHnmmev9a/tXUTn222+/vca6t9
56K7bccsvYYostImLdHzir8sorr0RErPOwr1UtX7483nvvvdw3QutSWFgYN954Yxx00EHRsWPH3CFv1dW0adM4/fT
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Визуализация данных - ящик с усами. Выборка относительно сбалансирована, есть умеренное смещение к центру, медиана уравновешена\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df3[\"salary_in_usd\"])\n",
|
|||
|
"plt.title(\"Box Plot для salary_in_usd\")\n",
|
|||
|
"plt.xlabel(\"salary_in_usd\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 66,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0sAAAIjCAYAAADSlID1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABwG0lEQVR4nO3deXhU1f3H8c9MkpnsGxASICRBNsMii6AoixUsigtuFa2iWEW0WsTWDZeibRXFautScavi0l8FFyhWRFBxR3ABWYLIFjYhAbLvycz5/ZFmzJCbZDJMmADv1/PMo7lz7jnfe+bO8uHO3GszxhgBAAAAALzYg10AAAAAALRFhCUAAAAAsEBYAgAAAAALhCUAAAAAsEBYAgAAAAALhCUAAAAAsEBYAgAAAAALhCUAAAAAsEBYAgAAAAALhCUAaMR9990nm80W7DK8nHbaaTrttNOCXUaLfPzxx7LZbPr444+DXcoRzWaz6b777gt2GQGVnp6uSZMmBbsMSW2rFgBtB2EJAAAAACyEBrsAAIDvlixZEuwSECTl5eUKDeVtGwAOJ44sAcBhYIxReXn5IffjcDjkcDgCUBGOBG63WxUVFZKk8PBwwhIAHGaEJQBHleLiYk2bNk3p6elyOp1KSkrSGWecoe+++87T5rPPPtOvfvUrde3aVU6nU6mpqbrlllt8CjMvvfSSTj/9dCUlJcnpdCozM1OzZ89u0C49PV3nnHOO3n//fZ144omKiIjQs88+q1GjRumEE06w7LtXr14aO3Zsk+Mf/Julut8DzZs3Tw888IC6dOmi8PBwjR49Wps3b252eyZNmqT09PQGy61+r7V06VINHz5c8fHxio6OVq9evXTXXXd5tdm1a5fOP/98RUVFKSkpSbfccosqKyubrePNN9+UzWbTJ5980uC+Z599VjabTevWrZMk7d27V1dffbW6dOkip9OplJQUjR8/XtnZ2c2O88MPP+jiiy9WYmKiwsPDdeKJJ2rhwoWe+3Nzc9WhQweddtppMsZ4lm/evFlRUVGaMGGCZ9lpp52mvn376ttvv9Upp5yiiIgIZWRk6JlnnmkwbmVlpWbMmKHu3bt79rnbb7+9wdzYbDbddNNN+te//qU+ffrI6XRq8eLFnvsO/s3S7t279Zvf/EYdO3aU0+lUnz599OKLL3q1aek+smLFCo0bN04JCQmKiopS//799fjjj7doHg9FQUGBpk2bptTUVDmdTnXv3l0PP/yw3G63JKm6ulqJiYm6+uqrG6xbVFSk8PBw3XrrrZ5lvs49AFjhn6gAHFWuv/56vfnmm7rpppuUmZmpAwcO6PPPP9eGDRs0aNAgSdIbb7yhsrIy3XDDDWrXrp1WrlypJ598Urt27dIbb7zRZP+zZ89Wnz59dN555yk0NFTvvPOOfvvb38rtduvGG2/0artx40ZddtllmjJliiZPnqxevXopOjpakydP1rp169S3b19P26+//lo//vij7rnnHr+2+6GHHpLdbtett96qwsJCzZo1S5dffrlWrFjhV38HW79+vc455xz1799ff/rTn+R0OrV582Z98cUXnjbl5eUaPXq0duzYoalTp6pTp0569dVX9dFHHzXb/9lnn63o6GjNmzdPo0aN8rpv7ty56tOnj2e+LrroIq1fv16/+93vlJ6ertzcXC1dulQ7duywDH71t+HUU09V586ddeeddyoqKkrz5s3T+eefr7feeksXXHCBkpKSNHv2bP3qV7/Sk08+qalTp8rtdmvSpEmKiYnR008/7dVnfn6+xo0bp0suuUSXXXaZ5s2bpxtuuEEOh0O/+c1vJNUeHTrvvPP0+eef67rrrtPxxx+vtWvX6m9/+5t+/PFHLViwwKvPjz76SPPmzdNNN92k9u3bN7pNOTk5Ovnkkz0Bq0OHDnrvvfd0zTXXqKioSNOmTfNq78s+snTpUp1zzjlKSUnRzTffrOTkZG3YsEH//e9/dfPNN/s8j/
4qKyvTqFGjtHv3bk2ZMkVdu3bVl19+qenTp2vPnj36+9//rrCwMF1wwQV6++239eyzz3odaV2wYIEqKyt16aWX+jX3ANCAAYCjSFxcnLnxxhubbFNWVtZg2cyZM43NZjPbt2/3LJsxY4Y5+GXSat2xY8eabt26eS1LS0szkszixYu9lhcUFJjw8HBzxx13eC2fOnWqiYqKMiUlJU3WPmrUKDNq1CjP38uWLTOSzPHHH28qKys9yx9//HEjyaxdu7bJ/q666iqTlpbWYPnB2/63v/3NSDL79u1rtK+///3vRpKZN2+eZ1lpaanp3r27kWSWLVvWZC2XXXaZSUpKMjU1NZ5le/bsMXa73fzpT38yxhiTn59vJJlHHnmkyb6sjB492vTr189UVFR4lrndbnPKKaeYHj16NKglMjLS/Pjjj+aRRx4xksyCBQu82owaNcpIMo8++qhnWWVlpRkwYIBJSkoyVVVVxhhjXn31VWO3281nn33mtf4zzzxjJJkvvvjCs0ySsdvtZv369Q3ql2RmzJjh+fuaa64xKSkpZv/+/V7tLr30UhMXF+fZV33dR2pqakxGRoZJS0sz+fn5Xn263W6/5rE5aWlp5qqrrvL8/ec//9lERUWZH3/80avdnXfeaUJCQsyOHTuMMca8//77RpJ55513vNqNGzfO67nYkrk/uBYAMMYYvoYH4KgSHx+vFStW6Keffmq0TUREhOf/S0tLtX//fp1yyikyxmjVqlVN9l9/3cLCQu3fv1+jRo3S1q1bVVhY6NU2IyOjwdfq4uLiNH78eP373//2fM3L5XJp7ty5nq+v+ePqq6/2+hf2ESNGSJK2bt3qV38Hi4+PlyT95z//8Xwd6mCLFi1SSkqKLr74Ys+yyMhIXXfddT6NMWHCBOXm5nqdYvzNN9+U2+32fP0tIiJCDodDH3/8sfLz832uPy8vTx999JEuueQSFRcXa//+/dq/f78OHDigsWPHatOmTdq9e7en/VNPPaW4uDhdfPHFuvfeezVx4kSNHz++Qb+hoaGaMmWK52+Hw6EpU6YoNzdX3377raTaI5nHH3+8evfu7Rl3//79Ov300yVJy5Yt8+pz1KhRyszMbHJ7jDF66623dO6558oY49Xv2LFjVVhY6PXVU6n5fWTVqlXatm2bpk2b5nm869R9JbOl89hSb7zxhkaMGKGEhASvbRozZoxcLpc+/fRTSdLpp5+u9u3ba+7cuZ518/PztXTpUq+vSrZ07gHgYIQlAEeVWbNmad26dUpNTdXQoUN13333NQgMO3bs0KRJk5SYmKjo6Gh16NDB89WvgwPPwb744guNGTNGUVFRio+PV4cOHTy/27EKS1auvPJK7dixQ5999pkk6YMPPlBOTo4mTpzo1zZLUteuXb3+TkhIkKQWBYqmTJgwQaeeeqquvfZadezYUZdeeqnmzZvnFZy2b9+u7t27N/itU69evXwa48wzz1RcXJzXB+C5c+dqwIAB6tmzpyTJ6XTq4Ycf1nvvvaeOHTtq5MiRmjVrlvbu3dtk35s3b5YxRvfee686dOjgdZsxY4ak2t8r1UlMTNQTTzyhNWvWKC4uTk888YRlv506dWoQcOtqrfsN1aZNm7R+/foG49a1qz+u1Ph+U9++fftUUFCg5557rkG/db/lObjf5vaRLVu2SJLX10MP1tJ5bKlNmzZp8eLFDfoeM2aMV9+hoaG66KKL9J///Mfz26O3335b1dXVXmGppXMPAAfjN0sAjiqXXHKJRowYofnz52vJkiV65JFH9PDDD+vtt9/WWWedJZfLpTPOOEN5eXm644471Lt3b0VFRWn37t2aNGlSo0dNpNoPk6NHj1bv3r312GOPKTU1VQ6HQ4sWLdLf/va3BuvWPwpV39ixY9WxY0e99tprGjlypF577TUlJyd7PhD6IyQkxHK5qXeSAiuNXXTX5XJ5/R0REaFPP/1Uy5Yt07vvvqvFixdr7ty5Ov3007VkyZJGx28Jp9
Op888/X/Pnz9fTTz+tnJwcffHFF3rwwQe92k2bNk3nnnuuFixYoPfff1/33nuvZs6cqY8++kgDBw607Lvusbn11ls
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Визуализируем соотношение уровня опыта и зарплаты\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df3[\"salary_in_usd\"], df3[\"experience_level\"])\n",
|
|||
|
"plt.xlabel(\"salary_in_usd\")\n",
|
|||
|
"plt.ylabel(\"experience_level\")\n",
|
|||
|
"plt.title(\"salary in usd vs experience_level\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 70,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1MAAAIjCAYAAADm7UHpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB+ZElEQVR4nO3deXwU9f3H8ffuJrub+w4JEJJwGw4RBAE5rIAg3iceKFRFa7UerXe1YlvFo5faitevYrWtIFWLVVFUvAABBUQOOQPhTMhB7mST3fn9EbNmc+5OFhPg9Xw88oDMfuc7n+8xk/ns7M5YDMMwBAAAAAAIiLWjAwAAAACAoxHJFAAAAACYQDIFAAAAACaQTAEAAACACSRTAAAAAGACyRQAAAAAmEAyBQAAAAAmkEwBAAAAgAkkUwAAAABgAskUALRg9uzZslgsHR2Gj9NOO02nnXZaR4cRkE8++UQWi0WffPJJR4dyVLNYLJo9e3ZHhxFUGRkZmjlzZkeHIalzxQLg6EEyBQAAAAAmhHR0AAAA/33wwQcdHQI6SGVlpUJC+LMNAJ0JV6YA4EdgGIYqKyvbXY/dbpfdbg9CRDgaeDweVVVVSZKcTifJFAB0MiRTAI4ppaWluu2225SRkSGHw6Hk5GRNmjRJa9as8Zb5/PPPdckll6hHjx5yOBxKS0vT7bff7ley89JLL+n0009XcnKyHA6HsrKyNHfu3CblMjIydPbZZ+v999/XySefrLCwMD333HMaP368TjzxxGbr7tevnyZPntzq9ht/Z6r++0gLFizQww8/rO7du8vpdGrChAnavn17m+2ZOXOmMjIymixv7vtiS5Ys0ZgxYxQbG6vIyEj169dP9913n0+ZvXv36vzzz1dERISSk5N1++23q7q6us04Fi5cKIvFok8//bTJa88995wsFos2bNggSTp48KB++tOfqnv37nI4HEpNTdV5552nXbt2tbmd7777ThdffLHi4+PldDp18skna9GiRd7X8/LylJSUpNNOO02GYXiXb9++XREREZo2bZp32WmnnaaBAwfq66+/1ujRoxUWFqbMzEw9++yzTbZbXV2tBx98UL179/bOubvuuqtJ31gsFt1888365z//qQEDBsjhcGjx4sXe1xp/Z2rfvn265ppr1KVLFzkcDg0YMEB///vffcoEOkdWrlypqVOnKi4uThERERo8eLCefPLJgPqxPQ4fPqzbbrtNaWlpcjgc6t27tx577DF5PB5JUk1NjeLj4/XTn/60ybolJSVyOp264447vMv87XsAMIO3uAAcU372s59p4cKFuvnmm5WVlaWCggJ98cUX2rx5s4YOHSpJev3111VRUaEbb7xRCQkJWrVqlZ5++mnt3btXr7/+eqv1z507VwMGDNC5556rkJAQvf322/r5z38uj8ejm266yafsli1bdPnll+uGG27QrFmz1K9fP0VGRmrWrFnasGGDBg4c6C27evVqbd26Vffff7+pdj/66KOyWq264447VFxcrMcff1xXXnmlVq5caaq+xjZu3Kizzz5bgwcP1m9/+1s5HA5t375dy5Yt85aprKzUhAkTlJOTo1tuuUVdu3bVK6+8oo8//rjN+s866yxFRkZqwYIFGj9+vM9r8+fP14ABA7z9ddFFF2njxo36xS9+oYyMDOXl5WnJkiXKyclpNjFs2IZTTz1V3bp10z333KOIiAgtWLBA559/vv7zn//oggsuUHJysubOnatLLrlETz/9tG655RZ5PB7NnDlTUVFReuaZZ3zqLCoq0tSpU3XppZfq8ssv14IFC3TjjTfKbrfrmmuukVR3dencc8/VF198oeuvv14nnHCCvv32W/35z3/W1q1b9dZbb/nU+fHHH2vBggW6+eablZiY2GKbcnNzNXLkSG8ClpSUpPfee0/XXnutSkpKdNttt/mU92eOLFmyRGeffbZSU1N16623KiUlRZs3b9b//vc/3XrrrX73o1
kVFRUaP3689u3bpxtuuEE9evTQ8uXLde+99+rAgQP6y1/+otDQUF1wwQV644039Nxzz/lcqX3rrbdUXV2tyy67zFTfA0DADAA4hsTExBg33XRTq2UqKiqaLJszZ45hsViM3bt3e5c9+OCDRuPDZHPrTp482ejZs6fPsvT0dEOSsXjxYp/lhw8fNpxOp3H33Xf7LL/llluMiIgIo6ysrNXYx48fb4wfP977+9KlSw1JxgknnGBUV1d7lz/55JOGJOPbb79ttb4ZM2YY6enpTZY3bvuf//xnQ5Jx6NChFuv6y1/+YkgyFixY4F1WXl5u9O7d25BkLF26tNVYLr/8ciM5Odmora31Ljtw4IBhtVqN3/72t4ZhGEZRUZEhyXjiiSdaras5EyZMMAYNGmRUVVV5l3k8HmP06NFGnz59msQSHh5ubN261XjiiScMScZbb73lU2b8+PGGJOOPf/yjd1l1dbUxZMgQIzk52XC5XIZhGMYrr7xiWK1W4/PPP/dZ/9lnnzUkGcuWLfMuk2RYrVZj48aNTeKXZDz44IPe36+99lojNTXVyM/P9yl32WWXGTExMd656u8cqa2tNTIzM4309HSjqKjIp06Px2OqH9uSnp5uzJgxw/v77373OyMiIsLYunWrT7l77rnHsNlsRk5OjmEYhvH+++8bkoy3337bp9zUqVN99sVA+r5xLADgDz7mB+CYEhsbq5UrV2r//v0tlgkLC/P+v7y8XPn5+Ro9erQMw9DatWtbrb/husXFxcrPz9f48eO1c+dOFRcX+5TNzMxs8rG9mJgYnXfeefr3v//t/RiZ2+3W/PnzvR+PM+OnP/2pzzv0Y8eOlSTt3LnTVH2NxcbGSpL++9//ej9u1di7776r1NRUXXzxxd5l4eHhuv766/3axrRp05SXl+dzC/WFCxfK4/F4P14XFhYmu92uTz75REVFRX7HX1hYqI8//liXXnqpSktLlZ+fr/z8fBUUFGjy5Mnatm2b9u3b5y3/17/+VTExMbr44ov1wAMP6KqrrtJ5553XpN6QkBDdcMMN3t/tdrtuuOEG5eXl6euvv5ZUdyX0hBNOUP/+/b3bzc/P1+mnny5JWrp0qU+d48ePV1ZWVqvtMQxD//nPf3TOOefIMAyfeidPnqzi4mKfj7ZKbc+RtWvXKjs7W7fddpt3vOvVf+Qz0H4M1Ouvv66xY8cqLi7Op00TJ06U2+3WZ599Jkk6/fTTlZiYqPnz53vXLSoq0pIlS3w+ihlo3wNAoEimABxTHn/8cW3YsEFpaWkaMWKEZs+e3SShyMnJ0cyZMxUfH6/IyEglJSV5P1rWOCFqbNmyZZo4caIiIiIUGxurpKQk7/eGmkummnP11VcrJydHn3/+uSTpww8/VG5urq666ipTbZakHj16+PweFxcnSQElHK2ZNm2aTj31VF133XXq0qWLLrvsMi1YsMAnsdq9e7d69+7d5LtW/fr182sbU6ZMUUxMjM8J8vz58zVkyBD17dtXkuRwOPTYY4/pvffeU5cuXTRu3Dg9/vjjOnjwYKt1b9++XYZh6IEHHlBSUpLPz4MPPiip7vtS9eLj4/XUU09p/fr1iomJ0VNPPdVsvV27dm2SANfHWv8drm3btmnjxo1NtltfruF2pZbnTUOHDh3S4cOH9fzzzzept/67RI3rbWuO7NixQ5J8Pn7aWKD9GKht27Zp8eLFTeqeOHGiT90hISG66KKL9N///tf73ac33nhDNTU1PslUoH0PAIHiO1MAjimXXnqpxo4dqzfffFMffPCBnnjiCT322GN64403dOaZZ8rtdmvSpEkqLCzU3Xffrf79+ysiIkL79u3TzJkzW7zqItWdbE6YMEH9+/fXn/70J6Wlpclut+vdd9/Vn//85ybrNryK1dDkyZPVpUsXvfrqqxo3bpxeffVVpaSkeE8YzbDZbM0uNxrcRKE5LT2U2O12+/weFhamzz77TEuXLtU777yjxYsXa/78+Tr99NP1wQ
cftLj9QDgcDp1//vl688039cwzzyg3N1fLli3TI4884lPutttu0znnnKO33npL77//vh544AHNmTNHH3/8sU466aR
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Detect outliers in salary_in_usd with Tukey's IQR rule\n",
"Q1 = df3[\"salary_in_usd\"].quantile(0.25)\n",
"Q3 = df3[\"salary_in_usd\"].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"\n",
"# Values more than 1.5 * IQR outside the quartiles count as outliers\n",
"threshold = 1.5 * IQR\n",
"outliers = (df3[\"salary_in_usd\"] < (Q1 - threshold)) | (\n",
"    df3[\"salary_in_usd\"] > (Q3 + threshold)\n",
")\n",
"\n",
"# Drop the outlier rows with the boolean mask instead of overwriting them\n",
"# with a 0 sentinel: rows whose salary is genuinely 0 are kept and the\n",
"# salary_in_usd column is never mutated in place.\n",
"df3 = df3[~outliers]\n",
"\n",
"# Plot the data after outlier removal\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df3[\"salary_in_usd\"], df3[\"experience_level\"])\n",
"plt.xlabel(\"salary_in_usd\")\n",
"plt.ylabel(\"experience_level\")\n",
"plt.title(\"salary in usd vs experience_level\")\n",
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 71,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 2214\n",
|
|||
|
"Размер контрольной выборки: 739\n",
|
|||
|
"Размер тестовой выборки: 739\n",
|
|||
|
"Распределение salary_in_usd в обучающей выборке:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"130000 60\n",
|
|||
|
"150000 59\n",
|
|||
|
"160000 56\n",
|
|||
|
"100000 56\n",
|
|||
|
"120000 52\n",
|
|||
|
" ..\n",
|
|||
|
"83864 1\n",
|
|||
|
"61467 1\n",
|
|||
|
"152380 1\n",
|
|||
|
"73742 1\n",
|
|||
|
"110037 1\n",
|
|||
|
"Name: count, Length: 741, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение salary_in_usd в контрольной выборке:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"100000 25\n",
|
|||
|
"150000 20\n",
|
|||
|
"140000 19\n",
|
|||
|
"120000 16\n",
|
|||
|
"135000 16\n",
|
|||
|
" ..\n",
|
|||
|
"247500 1\n",
|
|||
|
"128875 1\n",
|
|||
|
"36000 1\n",
|
|||
|
"121500 1\n",
|
|||
|
"85847 1\n",
|
|||
|
"Name: count, Length: 354, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение salary_in_usd в тестовой выборке:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"120000 23\n",
|
|||
|
"150000 19\n",
|
|||
|
"100000 18\n",
|
|||
|
"160000 16\n",
|
|||
|
"200000 13\n",
|
|||
|
" ..\n",
|
|||
|
"75648 1\n",
|
|||
|
"169200 1\n",
|
|||
|
"198800 1\n",
|
|||
|
"219535 1\n",
|
|||
|
"172200 1\n",
|
|||
|
"Name: count, Length: 364, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Split df3 into training (60%), validation (20%) and test (20%) sets.\n",
"# The subsets are always derived from df3, so no pre-saved CSV files are needed.\n",
"\n",
"# Hold out 20% of all rows as the test set\n",
"train_df, test_df = train_test_split(df3, test_size=0.2, random_state=42)\n",
"\n",
"# Take 25% of the remaining rows (20% of all rows) as the validation set\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"# Every row must end up in exactly one of the three subsets\n",
"assert len(train_df) + len(val_df) + len(test_df) == len(df3)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"\n",
"def check_balance(df3, name):\n",
"    \"\"\"Print how often each salary_in_usd value occurs in a frame.\n",
"\n",
"    Args:\n",
"        df3: DataFrame with a salary_in_usd column. The parameter shadows the\n",
"            global df3; any subset (train/val/test) may be passed.\n",
"        name: Label interpolated into the printed heading.\n",
"    \"\"\"\n",
"    counts = df3[\"salary_in_usd\"].value_counts()\n",
"    print(f\"Распределение salary_in_usd в {name}:\")\n",
"    print(counts)\n",
"    print()\n",
"\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 72,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение salary_in_usd в обучающей выборке после oversampling:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"127221 60\n",
|
|||
|
"105000 60\n",
|
|||
|
"100000 60\n",
|
|||
|
"260000 60\n",
|
|||
|
"130000 60\n",
|
|||
|
" ..\n",
|
|||
|
"113900 60\n",
|
|||
|
"110000 60\n",
|
|||
|
"129000 60\n",
|
|||
|
"203000 60\n",
|
|||
|
"200000 60\n",
|
|||
|
"Name: count, Length: 741, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение salary_in_usd в контрольной выборке после oversampling:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"99050 25\n",
|
|||
|
"126277 25\n",
|
|||
|
"192500 25\n",
|
|||
|
"194000 25\n",
|
|||
|
"177600 25\n",
|
|||
|
" ..\n",
|
|||
|
"140000 25\n",
|
|||
|
"75000 25\n",
|
|||
|
"165000 25\n",
|
|||
|
"72914 25\n",
|
|||
|
"252000 25\n",
|
|||
|
"Name: count, Length: 354, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение salary_in_usd в тестовой выборке после oversampling:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"219000 23\n",
|
|||
|
"143860 23\n",
|
|||
|
"72500 23\n",
|
|||
|
"140000 23\n",
|
|||
|
"66837 23\n",
|
|||
|
" ..\n",
|
|||
|
"109000 23\n",
|
|||
|
"126000 23\n",
|
|||
|
"70186 23\n",
|
|||
|
"199000 23\n",
|
|||
|
"60000 23\n",
|
|||
|
"Name: count, Length: 364, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"def oversample(df3):\n",
"    \"\"\"Balance the salary_in_usd values of a frame by random oversampling.\n",
"\n",
"    Every distinct salary_in_usd value is treated as a class label and passed\n",
"    to RandomOverSampler together with the remaining columns as features.\n",
"\n",
"    Args:\n",
"        df3: DataFrame containing a salary_in_usd column.\n",
"\n",
"    Returns:\n",
"        A new DataFrame: the resampled feature columns followed by the\n",
"        resampled salary_in_usd column.\n",
"    \"\"\"\n",
"    X = df3.drop(\"salary_in_usd\", axis=1)\n",
"    y = df3[\"salary_in_usd\"]\n",
"\n",
"    oversampler = RandomOverSampler(random_state=42)\n",
"    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore\n",
"\n",
"    resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
"    return resampled_df\n",
"\n",
"\n",
"# Resample the training set only. The validation and test sets keep their\n",
"# original distribution so that evaluation is not done on duplicated rows.\n",
"train_df_oversampled = oversample(train_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|