1508 lines
405 KiB
Plaintext
1508 lines
405 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"https://www.kaggle.com/datasets/harishkumardatalab/medical-insurance-price-prediction Набор представляет собой данные о мед страховке.\n",
|
|||
|
"Пример цели: Рассчитать стоимость будущей страховки\n",
|
|||
|
"Входные данные: возраст, пол, индекс массы тела, дети, курит ли, регион, стоимость"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 21,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Medical_insurance.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 22,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAlL0lEQVR4nO3df5TVdZ348dcMw49JYBBQZHLAX4g/At1VMiJLE91QyVJRNwxJCk3wZ6st2Qq2dvSoG2tuaZ42MUtd8Whk6hFMobNHKn/EmkUIhGILopgOA4L8+nz/aOd+Hfn9Y153Znw8zpkjcz+fe+/rvv043idzP/dWFEVRBAAAQDOrLPcAAADAB4P4AAAAUogPAAAghfgAAABSiA8AACCF+AAAAFKIDwAAIIX4AAAAUogPAAAghfgAAABSiA+AD4CXX345KioqYsqUKeUepSymTJkSFRUV8eyzzzb7fU2aNCkqKiqa/X4AWiPxAbADGp/Evvdr7733juOPPz4ee+yx9HlmzpzZZJb27dvHAQccEKNGjYo///nPu+U+nn766Zg0aVK8/fbbu+X2APjgEh8AO+Fb3/pW3H333fHjH/84rrrqqnjjjTfi5JNPjl/84hdlmeeSSy6Ju+++O+6444445ZRT4r/+679i0KBBsWTJkl2+7aeffjquvfZa8bGdvvnNb8bq1avLPQZAi1RV7gEAWqNhw4bF0UcfXfp+zJgx0atXr7j33nvj1FNPTZ/n2GOPjTPPPDMiIr70pS/FwQcfHJdcckncddddMWHChPR5Psiqqqqiqsr/XgE2x28+AHaDbt26RXV19SZPOletWhVf+9rXoq6uLjp27Bj9+/ePm2++OYqiiIiI1atXxyGHHBKHHHJIk78t/+tf/xq9e/eOj3/847Fhw4YdnufTn/50REQsWrRoq/s9+eSTceyxx8Yee+wR3bp1i9NOOy3mzp1b2j5p0qS48sorIyJi//33L7286+WXX97mDO9/eVrj18yZMzfZd/To0Zvdd9KkSU32e+CBB+Loo4+OLl26NNnv5ptv3uY8ERHvvPNOXHDBBdGjR4/o2rVrjBo1Kt56660m++y3335x6qmnxsyZM+Poo4+O6urqGDBgQGnuBx98MAYMGBCdOnWKo446Kn73u981ub5zPgC2zF/NAOyE+vr6WL58eRRFEa+//nrceuutsXLlyjj33HNL+xRFEZ/97GfjqaeeijFjxsSRRx4Zjz/+eFx55ZXxv//7vzF58uSorq6Ou+66K4YMGRJXX311fOc734mIiHHjxkV9fX1MmTIl2rVrt8PzLVy4MCIievToscV9nnjiiRg2bFgccMABMWnSpFi9enXceuutMWTIkHj++edjv/32i9NPPz1eeumluPfee2Py5MnRs2fPiIjYa6+9tmuOE088MUaNGhUREc8880x897vf3eK+PXv2jMmTJ5e+/+IXv9hk++zZs+Oss86KI444Im644YaoqamJ5cuXx+WXX75ds0REjB8/Prp16xaTJk2KefPmxW233RavvPJK6dyZRgsWLIgvfOELccEFF8S5554bN998cwwfPjxuv/32+MY3vhEXXXRRRERcf/31cdZZZ8W8efOistLf5wFsUwHAdrvzzjuLiNjkq2PHjsWUKVOa7Puzn/2siIjiuuuua3L5mWeeWVRUVBQLFiwoXTZhwoSisrKy+NWvflVMnTq1iIji3//937c5z1NPPVVERPGjH/2oeOONN4olS5YUjzzySLHffvsVFRUVxTPPPFMURVEsWrSoiIjizjvvLF33yCOPLPbee+/izTffLF32P//zP0VlZWUxatSo0mU33XRTERHFokWLtnud1q5dW0REMX78+NJljY/rqaee2mT/kSNHFvvvv3+TyyKimDhxYun7CRMmFBFRLF26tHRZ4+O66aabtjpP47+3o446qli7dm3p8htvvLGIiGLatGmly/r27VtERPH000+XLnv88ceLiC
iqq6uLV155pXT5D37wg00e08SJEwv/ewXYPH9NA7ATvve978WMGTNixowZ8ZOf/CSOP/74+PKXvxwPPvhgaZ9HH3002rVrF5dcckmT637ta1+LoiiavDvWpEmT4vDDD4/zzjsvLrroovjUpz61yfW25vzzz4+99toramtr45RTTolVq1bFXXfd1eS8lPdaunRpzJkzJ0aPHh3du3cvXT5w4MA48cQT49FHH93u+96cNWvWREREp06dtmv/tWvXRseOHbe6T0NDQ1RWVka3bt12eq6xY8dG+/btS99/9atfjaqqqk0e72GHHRaDBw8ufX/MMcdExN9eztanT59NLt9d7ywG0NaJD4Cd8NGPfjSGDh0aQ4cOjZEjR8YjjzwShx12WIwfPz7Wrl0bERGvvPJK1NbWRpcuXZpc99BDDy1tb9ShQ4f40Y9+FIsWLYqGhoa48847d+i8gWuuuSZmzJgRTz75ZLzwwguxZMmSTV629F6N992/f/9Nth166KGxfPnyWLVq1Xbf//stX748IiJqamq2a/+33347OnfuvNV9Bg8eHBs3boxLL700Fi5cGMuXL9/kfI1t6devX5PvO3fuHL17997kHJb3BkbE/38cdXV1m718R+cA+KByzgfAblBZWRnHH3983HLLLTF//vw4/PDDd/g2Hn/88Yj4228N5s+fH/vvv/92X3fAgAExdOjQHb7P5tL4ZH6//fbbrv1fe+216Nu371b3Oeecc+L555+PW2+9Ne64445dnHDrtnSezZYuL/7vDQQA2Dq/+QDYTdavXx8REStXroyIiL59+8aSJUuioaGhyX5/+tOfStsbvfDCC/Gtb30rvvSlL8Xf/d3fxZe//OWor69vtlkb73vevHmbbPvTn/4UPXv2jD322CMiYqfeuanxk8S39LKv91q3bl0sWLCg9BuhLamsrIybb745jjvuuOjXr1/pJW87Yv78+U2+X7lyZSxdunS7IwmAXSM+AHaDdevWxfTp06NDhw6lJ9Enn3xybNiwIf7jP/6jyb6TJ0+OioqKGDZsWOm6o0ePjtra2rjllltiypQpsWzZsh16F6cd1bt37zjyyCPjrrvuavLhgS+++GJMnz49Tj755NJljRGyIx8y+MADD0T//v3jkEMO2ea+06ZNi9WrV5feHnhrbr311njyySfjpz/9aQwdOjSGDBmy3TNFRNxxxx2xbt260ve33XZbrF+/vvTvAoDm5WVXADvhscceK/0G4/XXX4977rkn5s+fH//8z/8cXbt2jYiI4cOHx/HHHx9XX311vPzyy3HEEUfE9OnTY9q0aXHZZZfFgQceGBER1113XcyZMyd++ctfRpcuXWLgwIFxzTXXxDe/+c0488wzm4TA7nTTTTfFsGHDYvDgwTFmzJjSW+3W1NQ0+XyNo446KiIirr766jjnnHOiffv2MXz48FKUvNef//znuPHGG+O3v/1tnH766U1+M/HMM89ERMSMGTOiT58+sc8++8TEiRPj+9//fnz84x+Pk046aavz/uEPf4irrroqJk2aFIMGDdqpx7x27do44YQTSm+P+/3vfz8+8YlPxGc/+9mduj0AdlC5324LoDXZ3FvtdurUqTjyyCOL2267rdi4cWOT/RsaGorLL7+8qK2tLdq3b1/069evuOmmm0r7Pffcc0VVVVVx8cUXN7ne+vXri0GDBhW1tbXFW2+9tcV5Gt9qd+rUqVude3NvtVsURfHEE08UQ4YMKaqrq4uuXbsWw4cPL/74xz9ucv1//dd/LT784Q8XlZWVW33b3S29FfH7v+68887iL3/5S1FXV1dcdtllRX19/Sa3Fe95q901a9YUAwcOLD7xiU8U69ev3+Rxbe9b7c6aNasYO3ZsseeeexadO3cuRo4c2eSthovib2+1e8opp2x2nnHjxjW5bHP37612AbasoiicJQfA7jFlypSYNGnSVj8B/bjjjovRo0fH6NGj0+YCoGVwzgcAAJBCfACw2xx44I
Hx+c9/fqv7nHjiiaXzXQD4YPGyKwAAIIXffAAAACnEBwAAkGKnP+dj48aNsWTJkujSpctOffotAADQNhRFEQ0NDVF
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC3PUlEQVR4nOzdeXgUVdo28Dsh6SQQOiFACGtYXJIYQISAUYPLABHigssYlwFEBtCXMCM4LvkkiATNjI4vOIILyIg68II6qEjCNqDCQMTIImICLiwRMYQ1IUhoQvf3R6Z7SFLdfbpOpauq+/5dl9cl3U91Vbqrq89T55znhDgcDgeIiIiIiIjI70L1PgAiIiIiIqJgxYSMiIiIiIhIJ0zIiIiIiIiIdMKEjIiIiIiISCdMyIiIiIiIiHTChIyIiIiIiEgnTMiIiIiIiIh0woSMiIiIiIhIJ0zIiIiIiIiIdMKEjIiIDOvAgQMICQnBokWL9D6UBlavXo0rr7wSkZGRCAkJwalTp/Q+JNO64YYbkJqaqvdhEBHphgkZEZEOvvnmG9x9991ITExEZGQkOnfujKFDh+KVV15ptn0uWbIEc+bMafL44cOHMWPGDOzcubPZ9t3YZ599hpCQENd/4eHh6NmzJ0aPHo19+/Zpso8tW7ZgxowZmidLx48fxz333IOoqCjMmzcP7777Llq1auV1u1dffRUhISEYNGiQpsdjVNXV1Xj22WfRt29fREdHIyoqCqmpqXjyySdx+PBhvQ+PiMgwwvQ+ACKiYLNlyxbceOON6NatG8aPH4+EhAT89NNP+OKLL/Dyyy9j8uTJzbLfJUuWYPfu3Xj00UcbPH748GE8++yz6N69O6688spm2bc7f/jDH5CWlobz589j+/btmD9/PgoLC/HNN9+gU6dOUq+9ZcsWPPvss3jwwQcRGxurzQEDKCkpwenTp5Gfn48hQ4YIb7d48WJ0794dX375JX744Qdccsklmh2T0ezbtw9DhgxBeXk5fvvb32LChAmwWCzYtWsXFi5ciA8//BDfffed3odJRGQITMiIiPzsueeeQ0xMDEpKSpokCpWVlfocVDM4c+aM156jjIwM3H333QCAsWPH4rLLLsMf/vAHvP3228jNzfXHYfrM+Rn5kuTt378fW7ZswfLlyzFx4kQsXrwYzzzzTDMdob7q6upw55134siRI/jss89w3XXXNXj+ueeew1/+8he/H5PdbofFYvHrfomIRHDIIhGRn/3444+44oorFBv08fHxTR77xz/+gYEDB6Jly5Zo06YNBg8ejLVr17qe//jjj5GVlYVOnTohIiICvXr1Qn5+Pi5cuOCKueGGG1BYWIiDBw+6hgl2794dn332GdLS0gDUJ0TO5y6es7V161bcfPPNiImJQcuWLXH99ddj8+bNDY5xxowZCAkJQWlpKe6//360adOmSUNcxE033QSgPoHxZMOGDcjIyECrVq0QGxuL22+/HWVlZQ2O5/HHHwcA9OjRw/V3HThwwOPrvv/+++jfvz+ioqLQrl07/O53v8PPP//sev6GG27AmDFjAABpaWkICQnBgw8+6PXvWrx4Mdq0aYOsrCzcfffdWLx4sWLc8ePHMWrUKFitVsTGxmLMmDH4+uuvFefR7dmzB3fffTfi4uIQGRmJAQMGYMWKFR6P4/z584iLi8PYsWObPFddXY3IyEj86U9/cj32yiuv4IorrnCdewMGDMCSJUs87uOf//wnvv76azz99NOK54DVasVzzz3X5PHS0lLceOONaNmyJTp37owXXnihwfM2mw3Tp09H//79ERMTg1atWiEjIwOffvppgzjnvMO//vWvmDNnDnr16oWIiAiUlpYCqB8uO2DAAERGRqJXr1544403XOdvY//4xz9c50NcXBzuvfde/PTTTx7/fiIiX7GHjIjIzxITE1FcXIzdu3d7LWbw7LPPYsaMGbjmmmswc+ZMWCwWbN26FRs2bMCwYc
MAAIsWLUJ0dDSmTp2K6OhobNiwAdOnT0d1dTVefPFFAMDTTz+NqqoqHDp0CLNnzwYAREdHIzk5GTNnzsT06dMxYcIEZGRkAACuueYaAPWJz/Dhw9G/f38888wzCA0NxVtvvYWbbroJmzZtwsCBAxsc729/+1tceumleP755+FwOHx+b3788UcAQNu2bd3G/Otf/8Lw4cPRs2dPzJgxA2fPnsUrr7yCa6+9Ftu3b0f37t1x55134rvvvsP//d//Yfbs2WjXrh0AoH379m5fd9GiRRg7dizS0tJQUFCAI0eO4OWXX8bmzZuxY8cOxMbG4umnn8bll1+O+fPnY+bMmejRowd69erl9e9avHgx7rzzTlgsFtx333147bXXUFJS4kqGAcBut+PWW2/Fl19+iUceeQRJSUn4+OOPXQngxb799ltce+216Ny5M5566im0atUK7733HkaOHIl//vOfuOOOOxSPIzw8HHfccQeWL1+ON954o0GP0UcffYRz587h3nvvBQAsWLAAf/jDH3D33Xfjj3/8I2pra7Fr1y5s3boV999/v9u/1ZkUjho1yuv74nTy5EncfPPNuPPOO3HPPffggw8+wJNPPonevXtj+PDhAOoTxjfffBP33Xcfxo8fj9OnT2PhwoXIzMzEl19+2WS47VtvvYXa2lpMmDABERERiIuLw44dO3DzzTejY8eOePbZZ3HhwgXMnDlT8bx47rnnkJeXh3vuuQe///3vcfToUbzyyisYPHiw63wgItKEg4iI/Grt2rWOFi1aOFq0aOFIT093PPHEE441a9Y4bDZbg7jvv//eERoa6rjjjjscFy5caPCc3W53/f+vv/7aZB8TJ050tGzZ0lFbW+t6LCsry5GYmNgktqSkxAHA8dZbbzXZx6WXXurIzMxssr8ePXo4hg4d6nrsmWeecQBw3HfffULvwaeffuoA4Pj73//uOHr0qOPw4cOOwsJCR/fu3R0hISGOkpISh8PhcOzfv7/JsV155ZWO+Ph4x/Hjx12Pff31147Q0FDH6NGjXY+9+OKLDgCO/fv3ez0em83miI+Pd6SmpjrOnj3renzlypUOAI7p06e7HnvrrbccAFzH6M1XX33lAOBYt26dw+Gof1+7dOni+OMf/9gg7p///KcDgGPOnDmuxy5cuOC46aabmrwHv/nNbxy9e/du8Pna7XbHNddc47j00ks9Hs+aNWscAByffPJJg8dHjBjh6Nmzp+vft99+u+OKK64Q+hsv1q9fP0dMTIxw/PXXX+8A4HjnnXdcj507d86RkJDguOuuu1yP1dXVOc6dO9dg25MnTzo6dOjgeOihh1yPOc8Zq9XqqKysbBB/6623Olq2bOn4+eefXY99//33jrCwMMfFTaIDBw44WrRo4XjuuecabP/NN984wsLCmjxORCSDQxaJiPxs6NChKC4uxm233Yavv/4aL7zwAjIzM9G5c+cGQ84++ugj2O12TJ8+HaGhDS/XFw+vioqKcv3/6dOncezYMWRkZODXX3/Fnj17VB/nzp078f333+P+++/H8ePHcezYMRw7dgxnzpzBb37zG2zcuBF2u73BNg8//LBP+3jooYfQvn17dOrUCVlZWThz5gzefvttDBgwQDH+l19+wc6dO/Hggw8iLi7O9XifPn0wdOhQFBUV+f6HAvjqq69QWVmJ//mf/0FkZKTr8aysLCQlJaGwsFDV6wL1vWMdOnTAjTfeCKD+s8vOzsbSpUsbDCtdvXo1wsPDMX78eNdjoaGhmDRpUoPXO3HiBDZs2IB77rnH9XkfO3YMx48fR2ZmJr7//vsGwywbu+mmm9CuXTssW7bM9djJkyexbt06ZGdnux6LjY3FoUOHUFJS4tPfW11djdatW/u0TXR0NH73u9+5/m2xWDBw4MAGFTdbtGjh6tGz2+04ceIE6urqMGDAAGzfvr3Ja951110Ner4uXLiAf/3rXxg5cmSDgjGXXHKJqxfOafny5bDb7bjnnntc7+
+xY8eQkJCASy+9tMkwSSIiGRyySESkg7S0NCxfvhw2mw1ff/01PvzwQ8yePRt33303du7ciZSUFPz4448IDQ1FSkq
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Визуализация данных - ящик с усами. Как видим - у выборки есть смещение в меньшую сторону\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df[\"bmi\"])\n",
|
|||
|
"plt.title(\"Box Plot для bmi\")\n",
|
|||
|
"plt.xlabel(\"bmi\")\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"#Визуализируем отношение стоимости мед счетов и возраста\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"age\"], df[\"charges\"])\n",
|
|||
|
"plt.xlabel('Age')\n",
|
|||
|
"plt.ylabel(\"Charge\")\n",
|
|||
|
"plt.title('Scatter Plot of Age vs Charge')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 23,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" age sex bmi children smoker region charges\n",
|
|||
|
"14 27 male 42.13 0 yes southeast 39611.7577\n",
|
|||
|
"19 30 male 35.30 0 yes southwest 36837.4670\n",
|
|||
|
"23 34 female 31.92 1 yes northeast 37701.8768\n",
|
|||
|
"29 31 male 36.30 2 yes southwest 38711.0000\n",
|
|||
|
"30 22 male 35.60 0 yes southwest 35585.5760\n",
|
|||
|
"... ... ... ... ... ... ... ...\n",
|
|||
|
"2735 52 male 41.80 2 yes southeast 47269.8540\n",
|
|||
|
"2736 64 male 36.96 2 yes southeast 49577.6624\n",
|
|||
|
"2744 32 male 33.63 1 yes northeast 37607.5277\n",
|
|||
|
"2764 22 female 31.02 3 yes southeast 35595.5898\n",
|
|||
|
"2765 47 male 36.08 1 yes southeast 42211.1382\n",
|
|||
|
"\n",
|
|||
|
"[296 rows x 7 columns]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC3pUlEQVR4nOzdeXhU1fkH8G/2BMIEIoYAYQmghBgiiIhRExGBCHFBUUEtW5HFElvBqs1PQCBiWq0FKyiiVlygoC0qhSBEEEGJGhBETEBZI0gIewgShmHm9wedKUnunTlzz83cuTPfz/PwtM68Z+7NrOe955z3hDgcDgeIiIiIiIjI50KNPgEiIiIiIqJgxYSMiIiIiIjIIEzIiIiIiIiIDMKEjIiIiIiIyCBMyIiIiIiIiAzChIyIiIiIiMggTMiIiIiIiIgMwoSMiIiIiIjIIEzIiIiIiIiIDMKEjIiI/Na+ffsQEhKCBQsWGH0qtXzyySfo1q0boqOjERISgpMnTxp9SqbVu3dvpKWlGX0aRESGYUJGRGSA77//Hvfeey/atWuH6OhotG7dGv369cPLL7/cYMdctGgRZs+eXe/2X375BdOmTcPWrVsb7Nh1rVu3DiEhIa5/ERER6NChA4YPH449e/bocoyNGzdi2rRpuidLx44dw/3334+YmBjMnTsX7777Lho3buyx3SuvvIKQkBD06tVL1/PxV1VVVZg+fTquvvpqxMbGIiYmBmlpaXjqqafwyy+/GH16RER+I9zoEyAiCjYbN27ELbfcgrZt22LMmDFITEzEzz//jK+++govvfQSHn300QY57qJFi7B9+3Y89thjtW7/5ZdfMH36dLRv3x7dunVrkGOr+f3vf4+ePXvi/Pnz+PbbbzF//nysWLEC33//PVq1aiX12Bs3bsT06dMxcuRING3aVJ8TBlBSUoLTp08jPz8fffv2FW63cOFCtG/fHt988w127dqFTp066XZO/mbPnj3o27cvysvLcd9992Hs2LGIjIzEtm3b8Oabb+LDDz/Ejz/+aPRpEhH5BSZkREQ+NnPmTMTFxaGkpKReolBZWWnMSTWAM2fOeBw5yszMxL333gsAGDVqFK688kr8/ve/x9tvv428vDxfnKbXnK+RN0ne3r17sXHjRixduhTjxo3DwoUL8cwzzzTQGRrLZrPhnnvuweHDh7Fu3TrcdNNNte6fOXMm/vKXv/j8nOx2OyIjI316XCIiEZyySETkY7t378ZVV12l2KFPSEiod9t7772H6667Do0aNUKzZs2QlZWF1atXu+7/+OOPkZOTg1atWiEqKgodO3ZEfn4+Lly44Irp3bs3VqxYgf3797umCbZv3x7r1q1Dz549AVxMiJz3Xbpm6+uvv8Ztt92GuLg4NGrUCDfffDO+/PLLWuc4bdo0hISEoLS0FA8++CCaNWtWryMuok+fPgAuJjDurF27FpmZmWjcuDGaNm2Ku+66C2VlZbXO54knngAAJCcnu/6uffv2uX3cDz74AD169EBMTAyaN2+O3/zmNzh48KDr/t69e2PEiBEAgJ49eyIkJAQjR470+HctXLgQzZo1Q05ODu69914sXLhQMe7YsWMYNmwYLBYLmjZtihEjRuC7775TXEe3Y8cO3HvvvYiPj0d0dDSuvfZaLFu2zO15nD9/HvHx8Rg1alS9+6qqqhAdHY0//vGPrttefvllXHXVVa733rXXXotFixa5Pca///1vfPfdd3j66acV3wMWiwUzZ86sd3tpaSluueUWNGrUCK1bt8bzzz9f636r1YqpU6eiR48eiIuLQ+PGjZGZmYnPPvusVpxz3eFf//pXzJ49Gx07dkRUVBRKS0sBXJwue+211yI6OhodO3bEa6+95nr/1vXee++53g/x8fEYOnQofv75Z7d/PxGRtzhCRkTkY+3atUNxcTG2b9/usZjB9OnTMW3aNNxwww2YMWMGIiMj8fXXX2Pt2rXo378/AGDBgg
WIjY3FpEmTEBsbi7Vr12Lq1KmoqqrCCy+8AAB4+umncerUKRw4cACzZs0CAMTGxqJLly6YMWMGpk6dirFjxyIzMxMAcMMNNwC4mPgMGDAAPXr0wDPPPIPQ0FC89dZb6NOnDzZs2IDrrruu1vned999uOKKK/Dcc8/B4XB4/dzs3r0bAHDZZZepxnz66acYMGAAOnTogGnTpuHs2bN4+eWXceONN+Lbb79F+/btcc899+DHH3/EP//5T8yaNQvNmzcHAFx++eWqj7tgwQKMGjUKPXv2REFBAQ4fPoyXXnoJX375JbZs2YKmTZvi6aefRufOnTF//nzMmDEDycnJ6Nixo8e/a+HChbjnnnsQGRmJBx54AK+++ipKSkpcyTAA2O123HHHHfjmm2/wyCOPICUlBR9//LErAbzUDz/8gBtvvBGtW7fGn/70JzRu3Bjvv/8+Bg0ahH//+9+4++67Fc8jIiICd999N5YuXYrXXnut1ojRRx99hHPnzmHo0KEAgNdffx2///3vce+99+IPf/gDampqsG3bNnz99dd48MEHVf9WZ1I4bNgwj8+L04kTJ3Dbbbfhnnvuwf33349//etfeOqpp9C1a1cMGDAAwMWE8Y033sADDzyAMWPG4PTp03jzzTeRnZ2Nb775pt5027feegs1NTUYO3YsoqKiEB8fjy1btuC2225Dy5YtMX36dFy4cAEzZsxQfF/MnDkTU6ZMwf3334+HH34YR44cwcsvv4ysrCzX+4GISBcOIiLyqdWrVzvCwsIcYWFhjoyMDMeTTz7pWLVqlcNqtdaK++mnnxyhoaGOu+++23HhwoVa99ntdtf///XXX+sdY9y4cY5GjRo5ampqXLfl5OQ42rVrVy+2pKTEAcDx1ltv1TvGFVdc4cjOzq53vOTkZEe/fv1ctz3zzDMOAI4HHnhA6Dn47LPPHAAc//jHPxxHjhxx/PLLL44VK1Y42rdv7wgJCXGUlJQ4HA6HY+/evfXOrVu3bo6EhATHsWPHXLd99913jtDQUMfw4cNdt73wwgsOAI69e/d6PB+r1epISEhwpKWlOc6ePeu6ffny5Q4AjqlTp7pue+uttxwAXOfoyaZNmxwAHEVFRQ6H4+LzmpSU5PjDH/5QK+7f//63A4Bj9uzZrtsuXLjg6NOnT73n4NZbb3V07dq11utrt9sdN9xwg+OKK65wez6rVq1yAHD85z//qXX7wIEDHR06dHD991133eW46qqrhP7GS3Xv3t0RFxcnHH/zzTc7ADjeeecd123nzp1zJCYmOgYPHuy6zWazOc6dO1er7YkTJxwtWrRw/Pa3v3Xd5nzPWCwWR2VlZa34O+64w9GoUSPHwYMHXbf99NNPjvDwcMelXaJ9+/Y5wsLCHDNnzqzV/vvvv3eEh4fXu52ISAanLBIR+Vi/fv1QXFyMO++8E9999x2ef/55ZGdno3Xr1rWmnH300Uew2+2YOnUqQkNrf11fOr0qJibG9f9Pnz6No0ePIjMzE7/++it27Nih+Ty3bt2Kn376CQ8++CCOHTuGo0eP4ujRozhz5gxuvfVWrF+/Hna7vVab8ePHe3WM3/72t7j88svRqlUr5OTk4MyZM3j77bdx7bXXKsYfOnQIW7duxciRIxEfH++6PT09Hf369UNhYaH3fyiATZs2obKyEr/73e8QHR3tuj0nJwcpKSlYsWKFpscFLo6OtWjRArfccguAi6/dkCFDsHjx4lrTSj/55BNERERgzJgxrttCQ0MxYcKEWo93/PhxrF27Fvfff7/r9T569CiOHTuG7Oxs/PTTT7WmWdbVp08fNG/eHEuWLHHdduLECRQVFWHIkCGu25o2bYoDBw6gpKTEq7+3qqoKTZo08apNbGwsfvOb37j+OzIyEtddd12tipthYWGuET273Y7jx4/DZrPh2muvxbffflvvMQcPHlxr5OvChQv49NNPMWjQoFoFYzp16uQahXNaunQp7HY77r//ftfze/ToUSQmJu
KKK66oN02SiEgGpywSERmgZ8+eWLp0KaxWK7777jt8+OGHmDVrFu69915s3boVqamp2L17N0JDQ5Gamur2sX744Qd
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"\n",
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df[\"charges\"].quantile(0.25)\n",
|
|||
|
"Q3 = df[\"charges\"].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df[\"charges\"] < (Q1 - threshold)) | (df[\"charges\"] > (Q3 + threshold))\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае мы обнулим выбросы\n",
|
|||
|
"median_charge = df[\"charges\"].median()\n",
|
|||
|
"df.loc[outliers, \"charges\"] = 0\n",
|
|||
|
"df = df[df.charges != 0]\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"age\"], df[\"charges\"])\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.ylabel(\"Charge\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Age vs Charge\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Теперь создадим выборки."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 24,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 1485\n",
|
|||
|
"Размер контрольной выборки: 495\n",
|
|||
|
"Размер тестовой выборки: 496\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение остатка на контрольную и тестовую выборки\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Сохранение выборок в файлы\n",
|
|||
|
"train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n",
|
|||
|
"val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n",
|
|||
|
"test_df.to_csv(\".//static//csv//test_data.csv\", index=False)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проанализируем сбалансированность выборок"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Review_type в обучающей выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 765\n",
|
|||
|
"male 720\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 48.48%\n",
|
|||
|
"Процент женщин: 51.52%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Review_type в контрольной выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"male 257\n",
|
|||
|
"female 238\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 51.92%\n",
|
|||
|
"Процент женщин: 48.08%\n",
|
|||
|
"\n",
|
|||
|
"Распределение Review_type в тестовой выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 259\n",
|
|||
|
"male 237\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 47.78%\n",
|
|||
|
"Процент женщин: 52.22%\n",
|
|||
|
"\n",
|
|||
|
"Аугментация данных не требуется.\n",
|
|||
|
"Аугментация данных не требуется.\n",
|
|||
|
"Аугментация данных не требуется.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Оценка сбалансированности\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['sex'].value_counts()\n",
|
|||
|
" print(f\"Распределение Review_type в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print(f\"Процент мужчин: {counts['male'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент женщин: {counts['female'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Определение необходимости аугментации данных\n",
|
|||
|
"def need_augmentation(df):\n",
|
|||
|
" counts = df['sex'].value_counts()\n",
|
|||
|
" ratio = counts['male'] / counts['female']\n",
|
|||
|
" if ratio > 1.5 or ratio < 0.67:\n",
|
|||
|
" print(\"Необходима аугментация данных для балансировки классов.\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(\"Аугментация данных не требуется.\")\n",
|
|||
|
" \n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"need_augmentation(train_df)\n",
|
|||
|
"need_augmentation(val_df)\n",
|
|||
|
"need_augmentation(test_df)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"По результатам анализа требуется приращение, соотношения вне допустимого диапазона"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение sex в обучающей выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"male 765\n",
|
|||
|
"female 765\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в контрольной выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"male 257\n",
|
|||
|
"female 257\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в тестовой выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 259\n",
|
|||
|
"male 259\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение sex в обучающей выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 720\n",
|
|||
|
"male 720\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в контрольной выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 238\n",
|
|||
|
"male 238\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n",
|
|||
|
"Распределение sex в тестовой выборке:\n",
|
|||
|
"sex\n",
|
|||
|
"female 237\n",
|
|||
|
"male 237\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"Процент мужчин: 50.00%\n",
|
|||
|
"Процент женщин: 50.00%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование категориальных признаков в числовые\n",
|
|||
|
"def encode(df):\n",
|
|||
|
" label_encoders = {}\n",
|
|||
|
" for column in df.select_dtypes(include=['object']).columns:\n",
|
|||
|
" if column != 'sex': # Пропускаем целевую переменную\n",
|
|||
|
" le = LabelEncoder()\n",
|
|||
|
" df[column] = le.fit_transform(df[column])\n",
|
|||
|
" label_encoders[column] = le\n",
|
|||
|
" return label_encoders\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование целевой переменной в числовые значения\n",
|
|||
|
"def encode_target(df):\n",
|
|||
|
" le = LabelEncoder()\n",
|
|||
|
" df['sex'] = le.fit_transform(df['sex'])\n",
|
|||
|
" return le\n",
|
|||
|
"\n",
|
|||
|
"# Применение кодирования\n",
|
|||
|
"label_encoders = encode(train_df)\n",
|
|||
|
"encode(val_df)\n",
|
|||
|
"encode(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Кодирование целевой переменной\n",
|
|||
|
"le_target = encode_target(train_df)\n",
|
|||
|
"encode_target(val_df)\n",
|
|||
|
"encode_target(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка типов данных\n",
|
|||
|
"def check_data_types(df):\n",
|
|||
|
" for column in df.columns:\n",
|
|||
|
" if df[column].dtype == 'object':\n",
|
|||
|
" print(f\"Столбец '{column}' содержит строковые данные.\")\n",
|
|||
|
"\n",
|
|||
|
"check_data_types(train_df)\n",
|
|||
|
"check_data_types(val_df)\n",
|
|||
|
"check_data_types(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Функция для выполнения oversampling\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" if 'sex' not in df.columns:\n",
|
|||
|
" print(\"Столбец 'sex' отсутствует.\")\n",
|
|||
|
" return df\n",
|
|||
|
" \n",
|
|||
|
" X = df.drop('sex', axis=1)\n",
|
|||
|
" y = df['sex']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"# Функция для выполнения undersampling\n",
|
|||
|
"def undersample(df):\n",
|
|||
|
" if 'sex' not in df.columns:\n",
|
|||
|
" print(\"Столбец 'sex' отсутствует.\")\n",
|
|||
|
" return df\n",
|
|||
|
" \n",
|
|||
|
" X = df.drop('sex', axis=1)\n",
|
|||
|
" y = df['sex']\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"# Применение oversampling и undersampling к каждой выборке\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df)\n",
|
|||
|
"val_df_undersampled = undersample(val_df)\n",
|
|||
|
"test_df_undersampled = undersample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"# Обратное преобразование целевой переменной в строковые метки\n",
|
|||
|
"def decode_target(df, le_target):\n",
|
|||
|
" df['sex'] = le_target.inverse_transform(df['sex'])\n",
|
|||
|
"\n",
|
|||
|
"decode_target(train_df_oversampled, le_target)\n",
|
|||
|
"decode_target(val_df_oversampled, le_target)\n",
|
|||
|
"decode_target(test_df_oversampled, le_target)\n",
|
|||
|
"\n",
|
|||
|
"decode_target(train_df_undersampled, le_target)\n",
|
|||
|
"decode_target(val_df_undersampled, le_target)\n",
|
|||
|
"decode_target(test_df_undersampled, le_target)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка результатов\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" if 'sex' not in df.columns:\n",
|
|||
|
" print(f\"Столбец 'sex' отсутствует в {name}.\")\n",
|
|||
|
" return\n",
|
|||
|
" \n",
|
|||
|
" counts = df['sex'].value_counts()\n",
|
|||
|
" print(f\"Распределение sex в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" \n",
|
|||
|
" if 'male' in counts and 'female' in counts:\n",
|
|||
|
" print(f\"Процент мужчин: {counts['male'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" print(f\"Процент женщин: {counts['female'] / len(df) * 100:.2f}%\")\n",
|
|||
|
" else:\n",
|
|||
|
" print(\"Отсутствуют один или оба класса (male/female.\")\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности после oversampling\n",
|
|||
|
"print(\"Оверсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности после undersampling\n",
|
|||
|
"print(\"Андерсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Stroke Prediction Dataset\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset Датасет инсультов\n",
|
|||
|
"Цель: выжить\n",
|
|||
|
"Входные данные: пол, возраст, женат ли, есть ли заболевания сердца"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" id gender age hypertension heart_disease ever_married \\\n",
|
|||
|
"0 9046 Male 67 0 1 Yes \n",
|
|||
|
"1 51676 Female 61 0 0 Yes \n",
|
|||
|
"2 31112 Male 80 0 1 Yes \n",
|
|||
|
"3 60182 Female 49 0 0 Yes \n",
|
|||
|
"4 1665 Female 79 1 0 Yes \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"5105 18234 Female 80 1 0 Yes \n",
|
|||
|
"5106 44873 Female 81 0 0 Yes \n",
|
|||
|
"5107 19723 Female 35 0 0 Yes \n",
|
|||
|
"5108 37544 Male 51 0 0 Yes \n",
|
|||
|
"5109 44679 Female 44 0 0 Yes \n",
|
|||
|
"\n",
|
|||
|
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
|
|||
|
"0 Private Urban 228.69 36.6 formerly smoked \n",
|
|||
|
"1 Self-employed Rural 202.21 NaN never smoked \n",
|
|||
|
"2 Private Rural 105.92 32.5 never smoked \n",
|
|||
|
"3 Private Urban 171.23 34.4 smokes \n",
|
|||
|
"4 Self-employed Rural 174.12 24.0 never smoked \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"5105 Private Urban 83.75 NaN never smoked \n",
|
|||
|
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
|
|||
|
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
|
|||
|
"5108 Private Rural 166.29 25.6 formerly smoked \n",
|
|||
|
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
|
|||
|
"\n",
|
|||
|
" stroke \n",
|
|||
|
"0 1 \n",
|
|||
|
"1 1 \n",
|
|||
|
"2 1 \n",
|
|||
|
"3 1 \n",
|
|||
|
"4 1 \n",
|
|||
|
"... ... \n",
|
|||
|
"5105 0 \n",
|
|||
|
"5106 0 \n",
|
|||
|
"5107 0 \n",
|
|||
|
"5108 0 \n",
|
|||
|
"5109 0 \n",
|
|||
|
"\n",
|
|||
|
"[5110 rows x 12 columns]\n",
|
|||
|
"Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
|
|||
|
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
|
|||
|
" 'smoking_status', 'stroke'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//healthcare-dataset-stroke-data.csv\", sep=\",\")\n",
|
|||
|
"\n",
|
|||
|
"df[\"age\"] = df[\"age\"].astype(int)\n",
|
|||
|
"print(df)\n",
|
|||
|
"df[\"age\"].dtype\n",
|
|||
|
"\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
|
|||
|
" 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt \n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//diabetes.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Анализируем датафрейм при помощи \"ящика с усами\". Есть смещение в сторону меньших значений; это можно исправить при помощи oversampling и undersampling."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAqjElEQVR4nO3de5hVdb348c9cmGFSGFQUGLkIaqAkSkqGhEbeUjRMJS1QCT2YQopZiQdTLD2al2OpKdBRsSBNzCQTjuINj8Yp0AfxUoA6XlEIkYs4yGXW74/zYz/uuM0gfDczvF7Psx+dtb57zXf4tmze7LX2LsqyLAsAAIBtrLjQEwAAAHYM4gMAAEhCfAAAAEmIDwAAIAnxAQAAJCE+AACAJMQHAACQhPgAAACSEB8AAEAS4gOgkXvjjTeiqKgoxo0bV+ipALCDEx8AdTRu3LgoKirKe+yxxx7Rp0+fmDJlSvL5PPXUU3lzadKkSXTq1CnOPPPMeP3117fK9/jLX/4So0aNiiVLlmyV4xXa5MmTo6ioKKqqqqK2trbQ0wHY4ZQWegIADc1Pf/rT6NixY2RZFgsWLIhx48bF8ccfHw899FCccMIJyedzwQUXRI8ePWL16tXx/PPPx9ixY+Phhx+OF198Maqqqj7Tsf/yl7/ElVdeGYMGDYoWLVpsnQkX0IQJE2KvvfaKN954I5544ok46qijCj0lgB2KVz4A6um4446LgQMHxhlnnBE//OEP43/+53+iSZMmcc899xRkPr17946BAwfGd7/73bjlllvihhtuiMWLF8fdd99dkPlsr1asWBGTJk2KH/zgB9G9e/eYMGFCoacEsMMRHwCfUYsWLaKioiJKS/NfTF6xYkVcfPHF0a5duygvL4/OnTvHDTfcEFmWRURETU1NdOnSJbp06RI1NTW55y1evDjatGkThx12WKxdu7be8/na174WERHV1dWbHPfEE09E7969Y6eddooWLVpEv3794u9//3tu/6hRo+JHP/pRRER07Ngxd3nXG2+8sdk5/OvlaeseTz311HpjBw0atMGxo0aNyht3//33xyGHHBLNmjXLG3fDDTdsdj4REX/84x+jpqYm+vfvH6effno88MADsXLlyvXG1dTUxAUXXBAtW7aMZs2axTe+8Y149913Nzind999NwYPHhytWrWK8vLy6Nq1a9x55511mg/AjshlVwD1tHTp0li0aFFkWRYLFy6MW265JT766KMYOHBgbkyWZfGNb3wjnnzyyTj77LPjoIMOikceeSR+9KMfxbvvvhs33XRTVFRUxN133x29evWKkSNHxn/+539GRMTQoUNj6dKlMW7cuCgpKan3/F577bWIiNhtt902Ouaxxx6L4447Ljp16hSjRo2KmpqauOWWW6JXr17x/PPPx1577RUnn3xyzJ07N+6555646aabomXLlhERsfvuu9dpHkcffXSceeaZERExY8aMuPnmmzc6tmXLlnHTTTflvj7jjDPy9k+fPj2+9a1vxYEHHhjXXnttVFZWxqJFi+Kiiy6q01wi/u+Sqz59+kTr1q3j9NNPjxEjRsRDDz0U/fv3zxs3aNCguO++++KMM86IL3/5yzFt2rTo27fvesdbsGBBfPnLX46ioqIYNmxY7L777jFlypQ4++yzY9myZTF8+PA6zw1gh5EBUCd33XVXFhHrPcrLy7Nx48bljX3wwQeziMiuuuqqvO2nnnpqVlRUlL366qu5bZdeemlWXFycPf3009nEiROziMh+8YtfbHY+Tz75ZBYR2Z133pn985//zObPn589/PDD2V577ZUVFRVlM2bMyLIsy6qrq7OIyO66667ccw866KBsjz32yD744IPcthdeeCErLi7OzjzzzNy266+/PouIrLq6us5/TqtWrcoiIhs2bFhu27qf68knn1xv/IABA7KOHTvmbYuI7Iorrsh9femll2YRkb333nu5bet+ruuvv36zc1qwYEFWWl
qa/frXv85tO+yww7J+/frljXvuueeyiMiGDx+et33QoEHrzenss8/O2rRpky1atChv7Omnn55VVlZmH3/88WbnBbCjcdkVQD396le/iqlTp8bUqVNj/Pjx0adPnzjnnHPigQceyI2ZPHlylJSUxAUXXJD33IsvvjiyLMt7d6xRo0ZF165d46yzzorzzz8/jjjiiPWetymDBw+O3XffPaqqqqJv376xYsWKuPvuu+OQQw7Z4Pj33nsvZs2aFYMGDYpdd901t71bt25x9NFHx+TJk+v8vTdk3aVMTZs2rdP4VatWRXl5+SbHLF++PIqLi7f4pvd77703iouL45RTTslt+/a3vx1TpkyJDz/8MLftv//7vyMi4vzzz897/ve///28r7Msiz/84Q9x4oknRpZlsWjRotzj2GOPjaVLl8bzzz+/RXMFaMxcdgVQT1/60pfyfrH/9re/Hd27d49hw4bFCSecEGVlZfHmm29GVVVVNGvWLO+5++23X0REvPnmm7ltZWVlceedd0aPHj2iadOmcdddd0VRUVGd53P55ZdH7969o6SkJFq2bBn77bffeveffNq67925c+f19u23337xyCOPxIoVK2KnnXaq8xw+bdGiRRERUVlZWafxS5YsiZ133nmTY3r27Bm33nprXHjhhfHjH/84Kisr86Jhc8aPHx9f+tKX4oMPPogPPvggIiK6d+8eq1atiokTJ8aQIUMi4v/+bIqLi6Njx455z99nn33yvv7nP/8ZS5YsibFjx8bYsWM3+D0XLlxY5/kB7CjEB8BnVFxcHH369Ilf/vKXMW/evOjatWu9j/HII49ExP+9ajBv3rz1fvndlAMOOGC7esvYdTek77XXXnUa//7770eHDh02Oeb000+P559/Pm655ZaN/rK/MfPmzYsZM2ZERMS+++673v4JEybk4qOu1n1GyMCBA+Oss87a4Jhu3brV65gAOwLxAbAVrFmzJiIiPvroo4iI6NChQzz22GOxfPnyvFc//vGPf+T2rzN79uz46U9/Gt/97ndj1qxZcc4558SLL75Y51cO6mvd954zZ856+/7xj39Ey5Ytc6961OcVmHVmzpwZEbHRy74+bfXq1fHqq6/G17/+9U2OKy4ujhtuuCFefPHFqK6ujttuuy0WLFiQd5P/xkyYMCGaNGkSv/3tb9e7gf+ZZ56Jm2++Od56661o3759dOjQIWpra6O6ujovVF599dW85+2+++7RrFmzWLt27XYVfgDbO/d8AHxGq1evjkcffTTKyspyl1Udf/zxsXbt2rj11lvzxt50001RVFQUxx13XO65gwYNiqqqqvjlL38Z48aNiwULFtTrXZzqq02bNnHQQQfF3XffnffJ5S+99FI8+uijcfzxx+e2rYuQ+nzC+f333x+dO3eOLl26bHbspEmToqamJvf2wJtyyy23xBNPPBETJkyIo446Knr16lWn+UyYMCF69+4dp512Wpx66ql5j3VvJbzuM1qOPfbYiIi47bbb1vven1ZSUhKnnHJK/OEPf4iXXnppve/5z3/+s05zA9jReOUDoJ6mTJmSewVj4cKF8bvf/S7mzZsXI0aMiObNm0dExIknnhh9+vSJkSNHxhtvvBEHHnhgPProozFp0qQYPnx47L333hERcdVVV8WsWbPi8ccfj2bNmkW3bt3i8ssvj8suuyxOPfXUvBDYmq6//vo47rjjomfPnnH22Wfn3mq3srIy77MsDj744IiIGDlyZJx++unRpEmTOPHEEzd4P8jrr78e1113Xfztb3+Lk08+OcaPH5/bt+6yp6lTp0b79u2jdevWccUVV8Rtt90Whx12WBxzzDGbnO/LL78cP/7xj2PUqFHRo0ePOv+cf/3rX+PVV1+NYcOGbXD/nnvuGV/84hdjwoQJcckll8TBBx8cp5xySvziF7+IDz74IPdWu3Pnzo2I/FeCrr322njyySfj0EMPjX/7t3+L/fffPxYvXhzPP/98PPbYY7F48e
I6zxNgh1Hgd9sCaDA29Fa7TZs2zQ466KDs9ttvz2pra/PGL1++PLvooouyqqqqrEmTJtm+++6bXX/99blxzz33XFZ
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Box plot для столбца возраст\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['Age'])\n",
|
|||
|
"plt.title('Box Plot для Age')\n",
|
|||
|
"plt.xlabel('Age')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACyfklEQVR4nOzdeXhTVfoH8G9SugItLaW0rC2FEUoZ9qUgosgqAyI6Io4CioziLo4LMyOLzIjouI06IPhTHFBRR1RQqCKggBZBWQQBRSig0IK0tIVCW2ju749yQ9JmOecm9+Ym+X6eh+ehyentyc3NzXnP8h6LoigKiIiIiIiIyC1roCtARERERERkdgyciIiIiIiIvGDgRERERERE5AUDJyIiIiIiIi8YOBEREREREXnBwImIiIiIiMgLBk5EREREREReMHAiIiIiIiLygoETERERERGRFwyciIguOHjwICwWCxYtWhToqjjJzc1Fly5dEBMTA4vFgpKSkkBXKah98cUXsFgs+OKLL6R/d+LEiWjQoIH/KxUiJk6ciPT0dKfHLBYLZs6cGZD6hDpX55uI9MPAiSgM7Ny5E9dddx1at26NmJgYNG/eHIMHD8aLL76o299866238Pzzz9d5/OjRo5g5cya2b9+u29+uTW0oq/8iIyPRpk0bjB8/HgcOHPDL3/j6668xc+ZMvwc1RUVFuP766xEbG4uXX34ZixcvRv369b3+3n/+8x9YLBb07t3br/UxGzXYdXxvk5OT0bdvX/z1r3/F4cOHA11Fr/7zn//oHqwb8RkIdunp6U7nyPFfRUVFwOoViHsmEblWL9AVICJ9ff3117jiiivQqlUrTJ48Gampqfjll1+wadMmvPDCC7jnnnt0+btvvfUWdu3ahfvvv9/p8aNHj2LWrFlIT09Hly5ddPnb7tx7773o2bMnzp07h61bt2LBggX45JNPsHPnTjRr1synY3/99deYNWsWJk6ciEaNGvmnwgC2bNmCU6dOYfbs2Rg0aJDw77355ptIT0/H5s2b8fPPP6Nt27Z+q5MZjRs3DldddRVsNhtOnjyJLVu24Pnnn8cLL7yA//u//8MNN9xgL3vZZZfh7NmziIqKCmCNL/rPf/6D5ORkTJw4Ufe/pednwJ2zZ8+iXr3gaG506dIFDz74YJ3HA3mteLpnLly4EDabLTAVIwpDwXEnIyLN/vnPfyIhIQFbtmyp06A/fvx4YCqlg/Lycq8jMf3798d1110HALjlllvwu9/9Dvfeey/eeOMNTJs2zYhqSlPfI5lgLD8/H19//TWWLVuG22+/HW+++SZmzJihUw3NoVu3brjpppucHjt06BCGDBmCCRMmoEOHDujcuTMAwGq1IiYmJhDVDLhAfAb8ea5tNhuqqqp0e/+aN29e5zoys8jIyEBXgSiscKoeUYjbv38/Onbs6LLhnZKSUuexJUuWoFevXoiLi0NiYiIuu+wyfPbZZ/bnP/roI4wYMQLNmjVDdHQ0MjMzMXv2bFRXV9vLXH755fjkk09w6NAh+1SX9PR0fPHFF+jZsyeAmkab+pzjNKVvvvkGw4YNQ0JCAuLi4jBgwAB89dVXTnWcOXMmLBYLdu/ejRtvvBGJiYm49NJLpc/NwIEDAdQEGp6sXbsW/fv3R/369dGoUSNcffXV2LNnj1N9HnroIQBARkaG/XUdPHjQ43Hfe+89dO/eHbGxsUhOTsZNN92EI0eO2J+//PLLMWHCBABAz549YbFYhEYl3nzzTSQmJmLEiBG47rrr8Oabb7osV1RUhJtvvhnx8fFo1KgRJkyYgB07drhc57V3715cd911SEpKQkxMDHr06IHly5d7rMe5c+eQlJSEW265pc5zZWVliImJwV/+8hf7Yy+++CI6duxov/Z69OiBt956y+vrdad169ZYtGgRqqqq8NRTT9kfd7XGacOGDfjjH/+IVq1aITo6Gi1btsQDDzyAs2fPujz2gQMHMHToUN
SvXx/NmjXD448/DkVRnMrYbDY8//zz6NixI2JiYtC0aVPcfvvtOHnypL1Meno6fvjhB3z55Zf26+byyy+3P19SUoL7778fLVu2RHR0NNq2bYu5c+fWGWVYunQpunfvjoYNGyI+Ph6dOnXCCy+84PUcufoMrFq1yn69N2zYECNGjMAPP/xQ53c//PBDZGdnIyYmBtnZ2fjggw9c/g1Xa5y++OIL9OjRAzExMcjMzMQrr7xi/1zX/t27774bb775Jjp27Ijo6Gjk5uYCAI4cOYJbb70VTZs2RXR0NDp27IjXXnutzt+vrKzEjBkz0LZtW/t7+/DDD6OystLr+XHkqn4AsGjRojqf9/T0dPzhD3/Axo0b0atXL8TExKBNmzb473//W+f3S0pK8MADDyA9PR3R0dFo0aIFxo8fjxMnTni9Z7pa41ReXo4HH3zQfs1ccskl+Ne//lXn+lTPrfo+qudQPb9EVBdHnIhCXOvWrZGXl4ddu3YhOzvbY9lZs2Zh5syZ6Nu3Lx5//HFERUXhm2++wdq1azFkyBAANY2EBg0aYOrUqWjQoAHWrl2L6dOno6ysDE8//TQA4G9/+xtKS0vx66+/4rnnngMANGjQAB06dMDjjz+O6dOn489//jP69+8PAOjbty+AmgBl+PDh6N69O2bMmAGr1YrXX38dAwcOxIYNG9CrVy+n+v7xj39Eu3bt8MQTT9RpFIjYv38/AKBx48Zuy3z++ecYPnw42rRpg5kzZ+Ls2bN48cUX0a9fP2zduhXp6ekYM2YMfvrpJ7z99tt47rnnkJycDABo0qSJ2+MuWrQIt9xyC3r27Ik5c+bg2LFjeOGFF/DVV19h27ZtaNSoEf72t7/hkksuwYIFC/D4448jIyMDmZmZXl/Xm2++iTFjxiAqKgrjxo3DvHnzsGXLFnsDDKhp1I8cORKbN2/GlClT0L59e3z00Uf2QM3RDz/8gH79+qF58+Z49NFHUb9+fbz77rsYPXo03n//fVxzzTUu6xEZGYlrrrkGy5YtwyuvvOI03enDDz9EZWWlfQrdwoULce+99+K6667Dfffdh4qKCnz//ff45ptvcOONN3p9ze7k5OQgMzMTq1ev9ljuvffew5kzZzBlyhQ0btwYmzdvxosvvohff/0V7733nlPZ6upqDBs2DH369MFTTz2F3NxczJgxA+fPn8fjjz9uL3f77bfb3+d7770X+fn5eOmll7Bt2zZ89dVXiIyMxPPPP4977rkHDRo0wN/+9jcAQNOmTQEAZ86cwYABA3DkyBHcfvvtaNWqFb7++mtMmzYNBQUF9jWEq1evxrhx43DllVdi7ty5AIA9e/bgq6++wn333efxddf+DCxevBgTJkzA0KFDMXfuXJw5cwbz5s3DpZdeim3bttkb6Z999hmuvfZaZGVlYc6cOSgqKsItt9yCFi1aeH1Ptm3bhmHDhiEtLQ2zZs1CdXU1Hn/8cbefl7Vr1+Ldd9/F3XffjeTkZKSnp+PYsWPo06ePvfHfpEkTrFq1CpMmTUJZWZl9irDNZsOoUaOwceNG/PnPf0aHDh2wc+dOPPfcc/jpp5/w4YcfOv2tc+fO4cSJE06PxcXFIS4uzuvrqu3nn3/Gddddh0mTJmHChAl47bXXMHHiRHTv3h0dO3YEAJw+fRr9+/fHnj17cOutt6Jbt244ceIEli9fjl9//dXrPbM2RVEwatQorFu3DpMmTUKXLl3w6aef4qGHHsKRI0fs92PVxo0bsWzZMtx5551o2LAh/v3vf+Paa6/F4cOHPd4XicKWQkQh7bPPPlMiIiKUiIgIJScnR3n44YeVTz/9VKmqqnIqt2/fPsVqtSrXXHONUl1d7fSczWaz///MmTN1/sbtt9+uxMXFKRUVFfbHRowYobRu3bpO2S1btigAlNdff73O32jXrp0ydOjQOn8vIyNDGTx4sP2xGTNmKACUcePGCZ2DdevWKQCU1157Tfntt9+Uo0ePKp988omSnp6uWC
wWZcuWLYqiKEp+fn6dunXp0kVJSUlRioqK7I/t2LFDsVqtyvjx4+2PPf300woAJT8/32t9qqqqlJSUFCU7O1s5e/a
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
|
|||
|
"4 0 137 40 35 168 43.1 \n",
|
|||
|
"12 10 139 80 0 0 27.1 \n",
|
|||
|
"39 4 111 72 47 207 37.1 \n",
|
|||
|
"45 0 180 66 39 0 42.0 \n",
|
|||
|
"58 0 146 82 0 0 40.5 \n",
|
|||
|
"100 1 163 72 0 0 39.0 \n",
|
|||
|
"147 2 106 64 35 119 30.5 \n",
|
|||
|
"187 1 128 98 41 58 32.0 \n",
|
|||
|
"218 5 85 74 22 0 29.0 \n",
|
|||
|
"228 4 197 70 39 744 36.7 \n",
|
|||
|
"243 6 119 50 22 176 27.1 \n",
|
|||
|
"245 9 184 85 15 0 30.0 \n",
|
|||
|
"259 11 155 76 28 150 33.3 \n",
|
|||
|
"292 2 128 78 37 182 43.3 \n",
|
|||
|
"308 0 128 68 19 180 30.5 \n",
|
|||
|
"330 8 118 72 19 0 23.1 \n",
|
|||
|
"370 3 173 82 48 465 38.4 \n",
|
|||
|
"371 0 118 64 23 89 0.0 \n",
|
|||
|
"383 1 90 62 18 59 25.1 \n",
|
|||
|
"395 2 127 58 24 275 27.7 \n",
|
|||
|
"445 0 180 78 63 14 59.4 \n",
|
|||
|
"534 1 77 56 30 56 33.3 \n",
|
|||
|
"593 2 82 52 22 115 28.5 \n",
|
|||
|
"606 1 181 78 42 293 40.0 \n",
|
|||
|
"618 9 112 82 24 0 28.2 \n",
|
|||
|
"621 2 92 76 20 0 24.2 \n",
|
|||
|
"622 6 183 94 0 0 40.8 \n",
|
|||
|
"659 3 80 82 31 70 34.2 \n",
|
|||
|
"661 1 199 76 43 0 42.9 \n",
|
|||
|
"\n",
|
|||
|
" DiabetesPedigreeFunction Age Outcome \n",
|
|||
|
"4 2.288 33 1 \n",
|
|||
|
"12 1.441 57 0 \n",
|
|||
|
"39 1.390 56 1 \n",
|
|||
|
"45 1.893 25 1 \n",
|
|||
|
"58 1.781 44 0 \n",
|
|||
|
"100 1.222 33 1 \n",
|
|||
|
"147 1.400 34 0 \n",
|
|||
|
"187 1.321 33 1 \n",
|
|||
|
"218 1.224 32 1 \n",
|
|||
|
"228 2.329 31 0 \n",
|
|||
|
"243 1.318 33 1 \n",
|
|||
|
"245 1.213 49 1 \n",
|
|||
|
"259 1.353 51 1 \n",
|
|||
|
"292 1.224 31 1 \n",
|
|||
|
"308 1.391 25 1 \n",
|
|||
|
"330 1.476 46 0 \n",
|
|||
|
"370 2.137 25 1 \n",
|
|||
|
"371 1.731 21 0 \n",
|
|||
|
"383 1.268 25 0 \n",
|
|||
|
"395 1.600 25 0 \n",
|
|||
|
"445 2.420 25 1 \n",
|
|||
|
"534 1.251 24 0 \n",
|
|||
|
"593 1.699 25 0 \n",
|
|||
|
"606 1.258 22 1 \n",
|
|||
|
"618 1.282 50 1 \n",
|
|||
|
"621 1.698 28 0 \n",
|
|||
|
"622 1.461 45 0 \n",
|
|||
|
"659 1.292 27 1 \n",
|
|||
|
"661 1.394 22 1 \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADDWUlEQVR4nOzdeXhU1f0/8PdMSAIBEhICJGxJCFZWAUEgAlqRTSlI1a+ILYLSuC8Vq0hbWbSKuPtTKxq1Km5YpUoLxoWlgAahICiCiiGAYgKSAAGChGTu7494h5nMds6duXfOnXm/nsfnkcnJ5Myd7XzO+ZzPcWiapoGIiIiIiIgCcka7A0RERERERKpj4ERERERERBQCAyciIiIiIqIQGDgRERERERGFwMCJiIiIiIgoBAZOREREREREITBwIiIiIiIiCoGBExERERERUQgMnIiIiIiIiEJg4ERE9Itdu3bB4XDgpZdeinZXvBQXF6Nv375o2rQpHA4HDh06FO0u2dqqVavgcDiwatUq6d+dOnUqWrRoEflOxYipU6ciNzfX6zaHw4E5c+ZEpT+xzt/1JiLzMHAiigNffvklLr30UuTk5KBp06bo0KEDRo4ciSeffNK0v/n666/j8ccf97n9xx9/xJw5c7B582bT/nZj+kBZ/y8xMRFdunTBlVdeiZ07d0bkb3z66aeYM2dOxIOayspKXHbZZWjWrBmefvppLFy4EM2bNw/5e3//+9/hcDgwaNCgiPZHNXqw6/ncZmZm4uyzz8af//xn7NmzJ9pdDOnvf/+76cG6Fe8Bu8vNzfW6Rp7//fzzz1HrVzQ+M4nIvybR7gARmevTTz/Feeedh86dO6OwsBBZWVn4/vvvsW7dOjzxxBO4+eabTfm7r7/+OrZu3Yo//vGPXrf/+OOPmDt3LnJzc9G3b19T/nYgt9xyC8466yycPHkSmzZtwnPPPYelS5fiyy+/RPv27cO6708//RRz587F1KlT0apVq8h0GMCGDRtw5MgR3HvvvRgxYoTw77322mvIzc3F+vXr8d1336Fr164R65OKJk2ahAsvvBAulwsHDx7Ehg0b8Pjjj+OJJ57ACy+8gMsvv9zd9pxzzsHx48eRlJQUxR6f8ve//x2ZmZmYOnWq6X/LzPdAIMePH0eTJvYYbvTt2xe33367z+3RfK0E+8wsKiqCy+WKTseI4pA9PsmIyLD77rsPaWlp2LBhg8+Afv/+/dHplAmOHTsWciVm2LBhuPTSSwEAV111FX71q1/hlltuwcsvv4yZM2da0U1p+nMkE4yVlZXh008/xeLFi3Httdfitddew+zZs03qoRrOPPNM/P73v/e6bffu3Rg1ahSmTJmC7t27o0+fPgAAp9OJpk2bRqObUReN90Akr7XL5UJtba1pz1+HDh18XkcqS0xMjHYXiOIKU/WIYlxpaSl69uzpd+Ddtm1bn9teffVVDBw4ECkpKUhPT8c555yDDz/80P3z9957D2PHjkX79u2RnJyM/Px83Hvvvaivr3e3+fWvf42lS5di9+7d7lSX3NxcrFq1CmeddRaAhkGb/jPPNKXPPvsMY8aMQVpaGlJSUnDuuefik08+8erjnDlz4HA4sG3bNlxxxRVIT0/H0KFDpa/N8OHDATQEGsGsWLECw4YNQ/PmzdGqVStcdNFF2L59u1d/7rjjDgBAXl6e+3Ht2rUr6P3+85//RP/+/dGsWTNkZmbi97//Pfbu3ev++a9//WtMmTIFAHDWWWfB4XAIrUq89tprSE9Px9ixY3HppZfitdde89uusrISkydPRmpqKlq1aoUpU6Zgy5Ytfvd5ff3117j00kuRkZGBpk2bYsCAAViyZEnQfpw8eRIZGRm46qqrfH5WXV2Npk2b4k9/+pP7tieffBI9e/Z0v/YGDBiA119/PeTjDSQnJwcvvfQSamtr8eCDD7pv97fHac2aNfi///s/dO7cGcnJyejUqRNuu+02HD9+3O
9979y5E6NHj0bz5s3Rvn173HPPPdA0zauNy+XC448/jp49e6Jp06Zo164drr32Whw8eNDdJjc3F1999RX++9//ul83v/71r90/P3ToEP74xz+iU6dOSE5ORteuXTF//nyfVYY333wT/fv3R8uWLZGamorevXvjiSeeCHmN/L0H3n//fffrvWXLlhg7diy++uorn99999130atXLzRt2hS9evXCv/71L79/w98ep1WrVmHAgAFo2rQp8vPz8eyzz7rf141/96abbsJrr72Gnj17Ijk5GcXFxQCAvXv34uqrr0a7du2QnJyMnj174sUXX/T5+ydOnMDs2bPRtWtX93N755134sSJEyGvjyd//QOAl156yef9npubi9/85jdYu3YtBg4ciKZNm6JLly545ZVXfH7/0KFDuO2225Cbm4vk5GR07NgRV155JQ4cOBDyM9PfHqdjx47h9ttvd79mTj/9dDz88MM+r0/92urPo34N9etLRL644kQU43JyclBSUoKtW7eiV69eQdvOnTsXc+bMwdlnn4177rkHSUlJ+Oyzz7BixQqMGjUKQMMgoUWLFpg+fTpatGiBFStWYNasWaiursZDDz0EAPjLX/6Cw4cP44cffsBjjz0GAGjRogW6d++Oe+65B7NmzcI111yDYcOGAQDOPvtsAA0BygUXXID+/ftj9uzZcDqd+Mc//oHhw4djzZo1GDhwoFd//+///g+nnXYa7r//fp9BgYjS0lIAQOvWrQO2+fjjj3HBBRegS5cumDNnDo4fP44nn3wSQ4YMwaZNm5Cbm4uLL74Y3377Ld544w089thjyMzMBAC0adMm4P2+9NJLuOqqq3DWWWdh3rx52LdvH5544gl88skn+Pzzz9GqVSv85S9/wemnn47nnnsO99xzD/Ly8pCfnx/ycb322mu4+OKLkZSUhEmTJuGZZ57Bhg0b3AMwoGFQP27cOKxfvx7XX389unXrhvfee88dqHn66quvMGTIEHTo0AF33XUXmjdvjrfeegsTJkzAO++8g9/+9rd++5GYmIjf/va3WLx4MZ599lmvdKd3330XJ06ccKfQFRUV4ZZbbsGll16KW2+9FT///DO++OILfPbZZ7jiiitCPuZACgoKkJ+fj48++ihou3/+85+oqanB9ddfj9atW2P9+vV48skn8cMPP+Cf//ynV9v6+nqMGTMGgwcPxoMPPoji4mLMnj0bdXV1uOeee9ztrr32WvfzfMstt6CsrAxPPfUUPv/8c3zyySdITEzE448/jptvvhktWrTAX/7yFwBAu3btAAA1NTU499xzsXfvXlx77bXo3LkzPv30U8ycORPl5eXuPYQfffQRJk2ahPPPPx/z588HAGzfvh2ffPIJbr311qCPu/F7YOHChZgyZQpGjx6N+fPno6amBs888wyGDh2Kzz//3D1I//DDD3HJJZegR48emDdvHiorK3HVVVehY8eOIZ+Tzz//HGPGjEF2djbmzp2L+vp63HPPPQHfLytWrMBbb72Fm266CZmZmcjNzcW+ffswePBg9+C/TZs2eP/99zFt2jRUV1e7U4RdLhfGjx+PtWvX4pprrkH37t3x5Zdf4rHHHsO3336Ld9991+tvnTx5EgcOHPC6LSUlBSkpKSEfV2PfffcdLr30UkybNg1TpkzBiy++iKlTp6J///7o2bMnAODo0aMYNmwYtm/fjquvvhpnnnkmDhw4gCVLluCHH34I+ZnZmKZpGD9+PFauXIlp06ahb9+++OCDD3DHHXdg79697s9j3dq1a7F48WLccMMNaNmyJf7f//t/uOSSS7Bnz56gn4tEcUsjopj24YcfagkJCVpCQoJWUFCg3XnnndoHH3yg1dbWerXbsWOH5nQ6td/+9rdafX29189cLpf7/2tqanz+xrXXXqulpKRoP//8s/u2sWPHajk5OT5tN2zYoAHQ/vGPf/j8jdNOO00bPXq0z9/Ly8vTRo4c6b5t9uzZGgBt0qRJQtdg5c
qVGgDtxRdf1H766Sftxx9/1JYuXarl5uZqDodD27Bhg6ZpmlZWVubTt759+2pt27bVKisr3bdt2bJFczqd2pVXXum
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Имеется смещение в меньшую сторону\n",
|
|||
|
"df_cleaned = df.dropna()\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"Age\"], df[\"DiabetesPedigreeFunction\"])\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.ylabel(\"DiabetesPedigreeFunction\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Age vs DiabetesPedigreeFunction\")\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df[\"DiabetesPedigreeFunction\"].quantile(0.25)\n",
|
|||
|
"Q3 = df[\"DiabetesPedigreeFunction\"].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df[\"DiabetesPedigreeFunction\"] < (Q1 - threshold)) | (\n",
|
|||
|
" df[\"DiabetesPedigreeFunction\"] > (Q3 + threshold)\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае мы уберем выбросы\n",
|
|||
|
"median_charge = df[\"DiabetesPedigreeFunction\"].median()\n",
|
|||
|
"df.loc[outliers, \"DiabetesPedigreeFunction\"] = 0\n",
|
|||
|
"df = df[df.DiabetesPedigreeFunction != 0]\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"Age\"], df[\"DiabetesPedigreeFunction\"])\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.ylabel(\"DiabetesPedigreeFunction\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Age vs DiabetesPedigreeFunction\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 460\n",
|
|||
|
"Размер контрольной выборки: 154\n",
|
|||
|
"Размер тестовой выборки: 154\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Видим недостаток баланса:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Age в обучающей выборке:\n",
|
|||
|
"Age\n",
|
|||
|
"22 39\n",
|
|||
|
"21 36\n",
|
|||
|
"25 31\n",
|
|||
|
"24 25\n",
|
|||
|
"26 23\n",
|
|||
|
"27 22\n",
|
|||
|
"28 21\n",
|
|||
|
"23 21\n",
|
|||
|
"41 17\n",
|
|||
|
"31 15\n",
|
|||
|
"29 15\n",
|
|||
|
"37 14\n",
|
|||
|
"30 12\n",
|
|||
|
"45 11\n",
|
|||
|
"46 11\n",
|
|||
|
"36 11\n",
|
|||
|
"42 10\n",
|
|||
|
"34 10\n",
|
|||
|
"38 10\n",
|
|||
|
"32 9\n",
|
|||
|
"35 8\n",
|
|||
|
"52 8\n",
|
|||
|
"33 7\n",
|
|||
|
"43 7\n",
|
|||
|
"40 7\n",
|
|||
|
"47 5\n",
|
|||
|
"50 5\n",
|
|||
|
"51 5\n",
|
|||
|
"39 4\n",
|
|||
|
"48 4\n",
|
|||
|
"57 4\n",
|
|||
|
"66 3\n",
|
|||
|
"49 3\n",
|
|||
|
"44 3\n",
|
|||
|
"58 2\n",
|
|||
|
"55 2\n",
|
|||
|
"53 2\n",
|
|||
|
"63 2\n",
|
|||
|
"67 2\n",
|
|||
|
"54 2\n",
|
|||
|
"56 2\n",
|
|||
|
"60 2\n",
|
|||
|
"59 2\n",
|
|||
|
"61 1\n",
|
|||
|
"65 1\n",
|
|||
|
"81 1\n",
|
|||
|
"69 1\n",
|
|||
|
"68 1\n",
|
|||
|
"70 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в контрольной выборке:\n",
|
|||
|
"Age\n",
|
|||
|
"22 20\n",
|
|||
|
"21 15\n",
|
|||
|
"24 11\n",
|
|||
|
"23 10\n",
|
|||
|
"25 9\n",
|
|||
|
"33 8\n",
|
|||
|
"26 7\n",
|
|||
|
"31 6\n",
|
|||
|
"30 5\n",
|
|||
|
"28 5\n",
|
|||
|
"27 5\n",
|
|||
|
"39 4\n",
|
|||
|
"29 4\n",
|
|||
|
"42 4\n",
|
|||
|
"40 4\n",
|
|||
|
"32 3\n",
|
|||
|
"37 2\n",
|
|||
|
"46 2\n",
|
|||
|
"51 2\n",
|
|||
|
"44 2\n",
|
|||
|
"34 2\n",
|
|||
|
"43 2\n",
|
|||
|
"41 2\n",
|
|||
|
"45 2\n",
|
|||
|
"54 2\n",
|
|||
|
"35 2\n",
|
|||
|
"58 1\n",
|
|||
|
"53 1\n",
|
|||
|
"61 1\n",
|
|||
|
"59 1\n",
|
|||
|
"64 1\n",
|
|||
|
"36 1\n",
|
|||
|
"50 1\n",
|
|||
|
"72 1\n",
|
|||
|
"49 1\n",
|
|||
|
"47 1\n",
|
|||
|
"66 1\n",
|
|||
|
"62 1\n",
|
|||
|
"69 1\n",
|
|||
|
"55 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в тестовой выборке:\n",
|
|||
|
"Age\n",
|
|||
|
"22 13\n",
|
|||
|
"21 12\n",
|
|||
|
"24 10\n",
|
|||
|
"29 10\n",
|
|||
|
"28 9\n",
|
|||
|
"25 8\n",
|
|||
|
"23 7\n",
|
|||
|
"38 6\n",
|
|||
|
"27 5\n",
|
|||
|
"39 4\n",
|
|||
|
"30 4\n",
|
|||
|
"43 4\n",
|
|||
|
"42 4\n",
|
|||
|
"32 4\n",
|
|||
|
"36 4\n",
|
|||
|
"58 4\n",
|
|||
|
"31 3\n",
|
|||
|
"62 3\n",
|
|||
|
"44 3\n",
|
|||
|
"26 3\n",
|
|||
|
"41 3\n",
|
|||
|
"37 3\n",
|
|||
|
"60 3\n",
|
|||
|
"54 2\n",
|
|||
|
"50 2\n",
|
|||
|
"34 2\n",
|
|||
|
"65 2\n",
|
|||
|
"63 2\n",
|
|||
|
"45 2\n",
|
|||
|
"40 2\n",
|
|||
|
"33 2\n",
|
|||
|
"53 2\n",
|
|||
|
"55 1\n",
|
|||
|
"48 1\n",
|
|||
|
"67 1\n",
|
|||
|
"56 1\n",
|
|||
|
"51 1\n",
|
|||
|
"49 1\n",
|
|||
|
"57 1\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['Age'].value_counts()\n",
|
|||
|
" print(f\"Распределение Age в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Используем oversample"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 33,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Age в обучающей выборке после oversampling:\n",
|
|||
|
"Age\n",
|
|||
|
"26 39\n",
|
|||
|
"25 39\n",
|
|||
|
"33 39\n",
|
|||
|
"23 39\n",
|
|||
|
"35 39\n",
|
|||
|
"39 39\n",
|
|||
|
"29 39\n",
|
|||
|
"27 39\n",
|
|||
|
"21 39\n",
|
|||
|
"28 39\n",
|
|||
|
"41 39\n",
|
|||
|
"31 39\n",
|
|||
|
"24 39\n",
|
|||
|
"22 39\n",
|
|||
|
"42 39\n",
|
|||
|
"32 39\n",
|
|||
|
"36 39\n",
|
|||
|
"51 39\n",
|
|||
|
"34 39\n",
|
|||
|
"30 39\n",
|
|||
|
"45 39\n",
|
|||
|
"49 39\n",
|
|||
|
"57 39\n",
|
|||
|
"53 39\n",
|
|||
|
"48 39\n",
|
|||
|
"55 39\n",
|
|||
|
"46 39\n",
|
|||
|
"47 39\n",
|
|||
|
"52 39\n",
|
|||
|
"50 39\n",
|
|||
|
"37 39\n",
|
|||
|
"58 39\n",
|
|||
|
"61 39\n",
|
|||
|
"40 39\n",
|
|||
|
"65 39\n",
|
|||
|
"67 39\n",
|
|||
|
"38 39\n",
|
|||
|
"66 39\n",
|
|||
|
"81 39\n",
|
|||
|
"44 39\n",
|
|||
|
"43 39\n",
|
|||
|
"56 39\n",
|
|||
|
"54 39\n",
|
|||
|
"60 39\n",
|
|||
|
"69 39\n",
|
|||
|
"70 39\n",
|
|||
|
"68 39\n",
|
|||
|
"63 39\n",
|
|||
|
"59 39\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в контрольной выборке после oversampling:\n",
|
|||
|
"Age\n",
|
|||
|
"25 20\n",
|
|||
|
"58 20\n",
|
|||
|
"27 20\n",
|
|||
|
"26 20\n",
|
|||
|
"31 20\n",
|
|||
|
"22 20\n",
|
|||
|
"28 20\n",
|
|||
|
"39 20\n",
|
|||
|
"21 20\n",
|
|||
|
"45 20\n",
|
|||
|
"41 20\n",
|
|||
|
"61 20\n",
|
|||
|
"51 20\n",
|
|||
|
"64 20\n",
|
|||
|
"36 20\n",
|
|||
|
"30 20\n",
|
|||
|
"53 20\n",
|
|||
|
"40 20\n",
|
|||
|
"23 20\n",
|
|||
|
"59 20\n",
|
|||
|
"29 20\n",
|
|||
|
"42 20\n",
|
|||
|
"46 20\n",
|
|||
|
"43 20\n",
|
|||
|
"44 20\n",
|
|||
|
"34 20\n",
|
|||
|
"24 20\n",
|
|||
|
"32 20\n",
|
|||
|
"33 20\n",
|
|||
|
"72 20\n",
|
|||
|
"35 20\n",
|
|||
|
"50 20\n",
|
|||
|
"54 20\n",
|
|||
|
"47 20\n",
|
|||
|
"49 20\n",
|
|||
|
"66 20\n",
|
|||
|
"62 20\n",
|
|||
|
"69 20\n",
|
|||
|
"55 20\n",
|
|||
|
"37 20\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в тестовой выборке после oversampling:\n",
|
|||
|
"Age\n",
|
|||
|
"43 13\n",
|
|||
|
"21 13\n",
|
|||
|
"34 13\n",
|
|||
|
"50 13\n",
|
|||
|
"55 13\n",
|
|||
|
"22 13\n",
|
|||
|
"44 13\n",
|
|||
|
"37 13\n",
|
|||
|
"65 13\n",
|
|||
|
"40 13\n",
|
|||
|
"60 13\n",
|
|||
|
"29 13\n",
|
|||
|
"28 13\n",
|
|||
|
"24 13\n",
|
|||
|
"36 13\n",
|
|||
|
"48 13\n",
|
|||
|
"45 13\n",
|
|||
|
"39 13\n",
|
|||
|
"41 13\n",
|
|||
|
"27 13\n",
|
|||
|
"23 13\n",
|
|||
|
"38 13\n",
|
|||
|
"25 13\n",
|
|||
|
"67 13\n",
|
|||
|
"31 13\n",
|
|||
|
"30 13\n",
|
|||
|
"54 13\n",
|
|||
|
"33 13\n",
|
|||
|
"62 13\n",
|
|||
|
"26 13\n",
|
|||
|
"56 13\n",
|
|||
|
"58 13\n",
|
|||
|
"53 13\n",
|
|||
|
"63 13\n",
|
|||
|
"42 13\n",
|
|||
|
"32 13\n",
|
|||
|
"51 13\n",
|
|||
|
"49 13\n",
|
|||
|
"57 13\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop('Age', axis=1)\n",
|
|||
|
" y = df['Age']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Diamonds Prices2022"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"В данном наборе данных представлена цена на алмазы. Входные данные: цвет, караты, цена, чистота, размеры\n",
|
|||
|
"Цель: узнать, какие алмазы ценятся больше всего"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',\n",
|
|||
|
" 'price', 'x', 'y', 'z'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Diamonds Prices2022.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsLUlEQVR4nO3deZTXdd3//8fMIIs6DBeC4CQiuG8jKegxtUtLyzWzNDVKze1kmmtaHBWwy6wrvC6t3FvEvlZ2hVu573K8wD0SLddQKxBFZVUWmc/vD38zFyPbgONrGLjdzpkj8/m8l+f7M5/jmft83u/Pp6pSqVQCAADwMatu7wEAAIA1g/gAAACKEB8AAEAR4gMAAChCfAAAAEWIDwAAoAjxAQAAFCE+AACAIsQHAABQhPgAAACKEB8Aq5FXXnklVVVVGT16dHuPAgCLER8ASzB69OhUVVW1+Fp//fWz55575o477ig+z4MPPthilrXWWisDBw7MkUcemb///e9tso9x48Zl5MiRmT59eptsj+Tdd9/NyJEj8+CDD7b3KACrhE7tPQDAquz73/9+BgwYkEqlkqlTp2b06NHZb7/98qc//SkHHHBA8XlOOeWUDBkyJAsWLMhTTz2Vq6++OrfddlsmTpyY+vr6j7TtcePG5fzzz8/RRx+dHj16tM3Aa7h33303559/fpJkjz32aN9hAFYB4gNgGfbdd98MHjy4+ftjjz02ffr0ye9+97t2iY/dd989hxxySJLkG9/4RjbffPOccsopufbaazNs2LDi86xp3n///TQ2NqZz587tPQpAh+S0K4AV0KNHj3Tr1i2dOrX8282cOXNy5plnpl+/funSpUu22GKLXHTRRalUKkmS9957L1tuuWW23HLLvPfee83rvf3229lggw3yqU99KgsXLlzheT7zmc8kSSZNmrTM5e6///7svvvuWWedddKjR48cdNBB+dvf/tZ8/8iRI3PWWWclSQYMGNB8etcrr7yy3Bk+fHpa09eSTjU6+uijl7jsyJEjWyw3ZsyYDB48OLW1tS2Wu+iii5Y7z/Tp03P66adn4403TpcuXbLhhhvmyCOPzLRp05Ik8+fPz/Dhw7Pjjjumrq4u66yzTnbfffc88MADLbbTdP3MRRddlEsuuSSbbLJJunTpkr/+9a+t2sYrr7yS3r17J0nOP//8pR4rwJrEKx8AyzBjxoxMmzYtlUolb7zxRn72s59l9uzZ+drXvta8TKVSyRe+8IU88MADOfbYYzNo0KDcddddOeuss/Kvf/0rF198cbp165Zrr702u+66a84555z893//d5LkpJNOyowZMzJ69OjU1NSs8Hwvv/xykmS99dZb6jL33ntv9t133wwcODAjR47Me++9l5/97GfZdddd89RTT2XjjTfOl770pbzwwgv53e9+l4svvji9evVKkuZfnpdn7733zpFHHpkkefzxx/PTn/50qcv26tUrF198cfP3X//611vcP378+HzlK1/J9ttvnx/96Eepq6vLtGnTcvrppy93jtmzZ2f33XfP3/72txxzzDHZYYcdMm3atPzxj3/MP//5z/Tq1SszZ87ML37xixxxxBE5/vjjM2vWrPzyl7/M5z//+Tz22GMZNGhQi21ec801mTt3bk444YR06dIlPXv2bNU2evfunSuuuCInnnhiDj744HzpS19KkjQ0NLTqMQVYLVUAWMw111xTSbLYV5cuXSqjR49usezNN99cSVK54IILWtx+yCGHVKqqqiovvfRS823Dhg2rVFdXV8aOHVv5wx/+UElSueSSS5Y7zwMPPFBJUvnVr35VefPNNyuTJ0+u3HbbbZWNN964UlVVVXn88ccrlUqlMmnSpEqSyjXXXNO87qBBgyrrr79+5a233mq+7S9/+Uulurq6cuSRRzbfNmrUqEqSyqRJk1r9OM2fP7+SpHLyySc339Z0XA888MBiyw8dOrQyYMCAFrclqYwYMaL5+2
HDhlWSVKZMmdJ8W9NxjRo1apnzDB8+vJKkcuONNy52X2NjY6VSqVTef//9yrx581rc984771T69OlTOeaYYxbbZ/fu3StvvPFGi+Vbu40333xzseMDWJN55QNgGS677LJsvvnmSZKpU6fmuuuuy3HHHZfa2trmv2TffvvtqampySmnnNJi3TPPPDNjxozJHXfckZNPPjnJB6c33XrrrTnqqKMye/bs/Pu///ti6y3LMccc0+L73r1759prr21xXcqipkyZkgkTJuTss89Oz549m29vaGjI3nvvndtvv73V+16SuXPnJkm6du3aquXnz5+fLl26LHOZWbNmpbq6eqUuer/hhhuy/fbb5+CDD17svqqqqiRJTU1N86tMjY2NmT59ehobGzN48OA89dRTi6335S9/ebFXgFZ0GwB8wDUfAMuw0047Za+99spee+2VoUOH5rbbbsvWW2+dk08+OfPnz0+SvPrqq6mvr09tbW2Ldbfaaqvm+5t07tw5v/rVrzJp0qTMmjUr11xzTfMvxa0xfPjw3HPPPbn//vvz9NNPZ/LkyYudtrSopn1vscUWi9231VZbZdq0aZkzZ06r9/9hTddR1NXVtWr56dOnZ911113mMrvssksaGxtz6qmn5uWXX860adPyzjvvtGr7L7/8crbddtvlLnfttdemoaEhXbt2zXrrrZfevXvntttuy4wZMxZbdsCAAR95GwB8QHwArIDq6ursueeemTJlSl588cWV2sZdd92V5INXDVZ0G9ttt1322muv7Lnnntluu+0Wu/C9tKYL0jfeeONWLf/666+nb9++y1zm8MMPz5lnnpnRo0dn0003Te/evbPDDjt8xEn/z3XXXZejjz46m2yySX75y1/mzjvvzD333JPPfOYzaWxsXGz5bt26feRtAPABp10BrKD3338/yQcXNydJ//79c++992bWrFktXv147rnnmu9v8vTTT+f73/9+vvGNb2TChAk57rjjMnHixFa/crCimvb9/PPPL3bfc889l169emWdddZJkhV6BabJE088kSRLPe1rUQsWLMhLL72UffbZZ5nLVVdX56KLLsrEiRMzadKkXH755Zk6dWqLi/yXZpNNNskzzzyzzGXGjBmTgQMH5sYbb2xxzCNGjFju9ld0GyvzmAKszrzyAbACFixYkLvvvjudO3duPq1qv/32y8KFC3PppZe2WPbiiy9OVVVV9t133+Z1jz766NTX1+cnP/lJRo8enalTp7bqXZxW1gYbbJBBgwbl2muvbfHJ5c8880zuvvvu7Lfffs23NUXIinzC+ZgxY7LFFltkyy23XO6yt9xyS957773mtwdelp/97Ge5//7785vf/CZ77bVXdt1111bN8+Uvfzl/+ctfctNNNy12X+X/f9vjpms1mr5PkkcffTTjx49v1T5WZBtrr712khV7TAFWZ175AFiGO+64o/kVjDfeeCO//e1v8+KLL+Z73/teunfvniQ58MADs+eee+acc87JK6+8ku233z533313brnllpx22mnZZJNNkiQXXHBBJkyYkPvuuy+1tbVpaGjI8OHDc+655+aQQw5pEQJtadSoUdl3332zyy675Nhjj21+q926uroWnzmx4447JknOOeecHH744VlrrbVy4IEHNkfJov7+97/nxz/+cR577LF86UtfynXXXdd83+OPP54kueeee7LRRhulb9++GTFiRC6//PJ86lOfyuc+97llzvvss8/m7LPPzsiRIzNkyJAVOtazzjorY8aMyaGHHppjjjkmO+64Y95+++388Y9/zJVXXpntt98+BxxwQG688cYcfPDB2X///TNp0qRceeWV2XrrrZtfzVqe1m6jW7du2XrrrfP73/8+m2++eXr27Jltt922VdelAKyW2vndtgBWSUt6q92uXbtWBg0aVLniiiua37a1yaxZsyqnn356pb6+vrLWWmtVNttss8qoUaOal3vyyScrnTp1qnz7299usd77779fGT
JkSKW+vr7yzjvvLHWeprfa/cMf/rDMuZf0VruVSqVy7733VnbddddKt27dKt27d68ceOCBlb/+9a+Lrf8f//EflU9
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['carat'])\n",
|
|||
|
"plt.title('Box Plot для carat')\n",
|
|||
|
"plt.xlabel('carat')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIjCAYAAABswtioAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADIqElEQVR4nOzdeXhTZfo38G9SukM3KqRFKGUTSoECslQWFUEKjIi4DKsb4oCigr4OwygMiIo78hNHFBEdEVBHBRVEy+KwFRBKgVpUqC0otCBdoaVrzvtHPSFJs5xzcpKcpN/PdXFpk5OTJ2naPvd57ue+dYIgCCAiIiIiIiKP03t7AERERERERE0VAzIiIiIiIiIvYUBGRERERETkJQzIiIiIiIiIvIQBGRERERERkZcwICMiIiIiIvISBmRERERERERewoCMiIiIiIjISxiQEREREREReQkDMiIi8gn5+fnQ6XR4//33vT0UC1u2bEFKSgpCQkKg0+lQWlrqtue699570b59e7edn4iIPI8BGRGRlx07dgx33HEHEhISEBISgjZt2mDEiBF444033Paca9euxeuvv97o9rNnz2LhwoXIyspy23Nb+/7776HT6Uz/AgMD0aFDB9x999349ddfVXmOvXv3YuHChaoHS0VFRbjrrrsQGhqKN998Ex9++CHCw8NtHvv+++9bvM6QkBB06dIFs2bNwrlz51Qdly/KysrClClT0LZtWwQHByMmJgbDhw/H6tWrUV9f75UxPf/889iwYYNXnpuImo5m3h4AEVFTtnfvXtx4441o164dpk+fDoPBgN9++w379u3DsmXL8Mgjj7jledeuXYvs7GzMnj3b4vazZ89i0aJFaN++PVJSUtzy3PY8+uij6NevH2pra5GZmYl33nkHmzZtwrFjxxAfH+/Suffu3YtFixbh3nvvRVRUlDoDBvDDDz/g4sWLWLx4MYYPHy7pMc888wwSExNRVVWF3bt346233sLmzZuRnZ2NsLAwh49duXIljEajGkPXlHfffRczZsxA69atMXXqVHTu3BkXL17Etm3bMG3aNBQUFOCf//ynx8f1/PPP44477sC4ceM8/txE1HQwICMi8qLnnnsOkZGR+OGHHxoFCufPn/fOoNygoqLC7sqRaMiQIbjjjjsAAPfddx+6dOmCRx99FB988AHmzZvniWHKJn6P5AR5o0aNwrXXXgsAeOCBB9CyZUu89tpr2LhxIyZOnGjzMeL7FxgY6PKYtWbfvn2YMWMGUlNTsXnzZrRo0cJ03+zZs3Hw4EFkZ2e7/DxGoxE1NTUICQlx+VxERGpiyiIRkRfl5uaie/fuNif0rVq1anTbmjVr0L9/f4SFhSE6OhpDhw7Fd999Z7p/48aNGDNmDOLj4xEcHIyOHTti8eLFFilfN9xwAzZt2oRTp06Z0ufat2+P77//Hv369QPQEBCJ95nv2dq/fz/S0tIQGRmJsLAwXH/99dizZ4/FGBcuXAidToecnBxMmjQJ0dHRGDx4sOz3ZtiwYQCAvLw8h8dt374dQ4YMQXh4OKKionDrrbfi+PHjFuN58sknAQCJiYmm15Wfn+/wvJ9++in69u2L0NBQxMbGYsqUKThz5ozp/htuuAH33HMPAKBfv37Q6XS49957XX6d9957L5o3b47c3FyMHj0aLVq0wOTJk033We8hMxqNWLZsGXr06IGQkBBcddVVSEtLw8GDBy2OW7Nmjen1xMTEYMKECfjtt98cju2///0vdDod/ve//zW67+2334ZOpzMFS4WFhbjvvvtw9dVXIzg4GHFxcbj11ludvs+LFi2CTqfDRx99ZBGMia699lqL9/WVV17Bddddh5YtWyI0NBR9+/bFf//730aP0+l0mDVrFj766CN0794dwcHB2LJli+Rz6HQ6VFRU4IMPPjB9ZpR8f4mInOEKGRGRFyUkJCAjIwPZ2dlITk52eOyiRYuwcOFCXHfddXjmmWcQFB
SE/fv3Y/v27bj55psBNOxTat68OR5//HE0b94c27dvx4IFC1BeXo6XX34ZAPDUU0+hrKwMv//+O5YuXQoAaN68Obp164ZnnnkGCxYswIMPPoghQ4YAAK677joADYHPqFGj0LdvX/zrX/+CXq/H6tWrMWzYMOzatQv9+/e3GO+dd96Jzp074/nnn4cgCLLfm9zcXABAy5Yt7R6zdetWjBo1Ch06dMDChQtx+fJlvPHGGxg0aBAyMzPRvn17jB8/Hr/88gvWrVuHpUuXIjY2FgBw1VVX2T3v+++/j/vuuw/9+vXDkiVLcO7cOSxbtgx79uzB4cOHERUVhaeeegrXXHMN3nnnHVMaYseOHVV5nXV1dRg5ciQGDx6MV155xWEq47Rp0/D+++9j1KhReOCBB1BXV4ddu3Zh3759ppW45557DvPnz8ddd92FBx54AH/88QfeeOMNDB061PR6bBkzZgyaN2+OTz75BNdff73FfR9//DG6d+9u+tzefvvt+PHHH/HII4+gffv2OH/+PNLT03H69Gm7hUgqKyuxbds2DB06FO3atZP0fi1btgxjx47F5MmTUVNTg/Xr1+POO+/E119/jTFjxlgcu337dnzyySeYNWsWYmNjTeOQco4PP/wQDzzwAPr3748HH3wQABR9f4mInBKIiMhrvvvuOyEgIEAICAgQUlNThb///e/Ct99+K9TU1Fgcd+LECUGv1wu33XabUF9fb3Gf0Wg0/X9lZWWj5/jb3/4mhIWFCVVVVabbxowZIyQkJDQ69ocffhAACKtXr270HJ07dxZGjhzZ6PkSExOFESNGmG7717/+JQAQJk6cKOk92LFjhwBAeO+994Q//vhDOHv2rLBp0yahffv2gk6nE3744QdBEAQhLy+v0dhSUlKEVq1aCUVFRabbjhw5Iuj1euHuu+823fbyyy8LAIS8vDyn46mpqRFatWolJCcnC5cvXzbd/vXXXwsAhAULFphuW716tQDANEZHxGO3bt0q/PHHH8Jvv/0mrF+/XmjZsqUQGhoq/P7774IgCMI999wjABD+8Y9/NDrHPffcY/F92759uwBAePTRRxsdK36f8vPzhYCAAOG5556zuP/YsWNCs2bNGt1ubeLEiUKrVq2Euro6020FBQWCXq8XnnnmGUEQBKGkpEQAILz88stO3wdzR44cEQAIjz32mOTHWH/Ga2pqhOTkZGHYsGEWtwMQ9Hq98OOPPyo+R3h4uHDPPfdIHhsRkRJMWSQi8qIRI0YgIyMDY8eOxZEjR/DSSy9h5MiRaNOmDb788kvTcRs2bIDRaMSCBQug11v+6tbpdKb/Dw0NNf3/xYsXceHCBQwZMgSVlZX46aefFI8zKysLJ06cwKRJk1BUVIQLFy7gwoULqKiowE033YSdO3c2KjYxY8YMWc9x//3346qrrkJ8fDzGjBljShcTV3msFRQUICsrC/feey9iYmJMt/fs2RMjRozA5s2b5b9QAAcPHsT58+fx0EMPWew3GjNmDLp27YpNmzYpOq9o+PDhuOqqq9C2bVtMmDABzZs3xxdffIE2bdpYHDdz5kyn5/rss8+g0+nwr3/9q9F94ufi888/h9FoxF133WX6vl24cAEGgwGdO3fGjh07HD7HX//6V5w/fx7ff/+96bb//ve/MBqN+Otf/wqg4XMXFBSE77//HiUlJU7HLSovLwcAm6mK9ph/xktKSlBWVoYhQ4YgMzOz0bHXX389kpKSXDoHEZG7MWWRiMjL+vXrh88//xw1NTU4cuQIvvjiCyxduhR33HEHsrKykJSUhNzcXOj1epuTS3M//vgjnn76aWzfvt002RWVlZUpHuOJEycAwLRnypaysjJER0ebvk5MTJT1HAsWLMCQIUMQEBCA2NhYdOvWDc2a2f8zderUKQDANddc0+i+bt264dtvv5VUTETOebt27Yrdu3fLOp+1N998E126dEGzZs3QunVrXHPNNY2C7GbNmuHqq692eq7c3FzEx8
dbBKTWTpw4AUEQ0LlzZ5v3OysUIu4Z/Pjjj3HTTTcBaEhXTElJQZcuXQAAwcHBePHFF/HEE0+gdevWGDhwIP7yl7/
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"carat\"], df[\"price\"])\n",
|
|||
|
"plt.xlabel(\"carat\")\n",
|
|||
|
"plt.ylabel(\"price\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Price vs Carat\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Удаление строк с пропущенными значениями (NaN)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df_cleaned = df.dropna()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки (60 % / 20 % / 20 %)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Применение методов приращения данных (аугментации): случайный оверсэмплинг (RandomOverSampler) по значениям price"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 38,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 32365\n",
|
|||
|
"Размер контрольной выборки: 10789\n",
|
|||
|
"Размер тестовой выборки: 10789\n",
|
|||
|
"Распределение price в обучающей выборке:\n",
|
|||
|
"price\n",
|
|||
|
"789 80\n",
|
|||
|
"605 79\n",
|
|||
|
"544 72\n",
|
|||
|
"552 72\n",
|
|||
|
"828 71\n",
|
|||
|
" ..\n",
|
|||
|
"9942 1\n",
|
|||
|
"7787 1\n",
|
|||
|
"18663 1\n",
|
|||
|
"7979 1\n",
|
|||
|
"8164 1\n",
|
|||
|
"Name: count, Length: 9496, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в контрольной выборке:\n",
|
|||
|
"price\n",
|
|||
|
"625 34\n",
|
|||
|
"828 31\n",
|
|||
|
"605 30\n",
|
|||
|
"789 26\n",
|
|||
|
"544 26\n",
|
|||
|
" ..\n",
|
|||
|
"4188 1\n",
|
|||
|
"7541 1\n",
|
|||
|
"3498 1\n",
|
|||
|
"3314 1\n",
|
|||
|
"12196 1\n",
|
|||
|
"Name: count, Length: 5383, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в тестовой выборке:\n",
|
|||
|
"price\n",
|
|||
|
"802 33\n",
|
|||
|
"844 29\n",
|
|||
|
"776 29\n",
|
|||
|
"675 26\n",
|
|||
|
"645 25\n",
|
|||
|
" ..\n",
|
|||
|
"1567 1\n",
|
|||
|
"5529 1\n",
|
|||
|
"2031 1\n",
|
|||
|
"417 1\n",
|
|||
|
"5431 1\n",
|
|||
|
"Name: count, Length: 5338, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в обучающей выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"5076 80\n",
|
|||
|
"1789 80\n",
|
|||
|
"3931 80\n",
|
|||
|
"1263 80\n",
|
|||
|
"2026 80\n",
|
|||
|
" ..\n",
|
|||
|
"3678 80\n",
|
|||
|
"4592 80\n",
|
|||
|
"516 80\n",
|
|||
|
"7152 80\n",
|
|||
|
"2353 80\n",
|
|||
|
"Name: count, Length: 9496, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в контрольной выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"966 34\n",
|
|||
|
"13638 34\n",
|
|||
|
"3669 34\n",
|
|||
|
"1052 34\n",
|
|||
|
"2818 34\n",
|
|||
|
" ..\n",
|
|||
|
"4032 34\n",
|
|||
|
"544 34\n",
|
|||
|
"3362 34\n",
|
|||
|
"6559 34\n",
|
|||
|
"792 34\n",
|
|||
|
"Name: count, Length: 5383, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в тестовой выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"3742 33\n",
|
|||
|
"559 33\n",
|
|||
|
"8403 33\n",
|
|||
|
"1238 33\n",
|
|||
|
"1243 33\n",
|
|||
|
" ..\n",
|
|||
|
"1149 33\n",
|
|||
|
"2401 33\n",
|
|||
|
"958 33\n",
|
|||
|
"702 33\n",
|
|||
|
"14618 33\n",
|
|||
|
"Name: count, Length: 5338, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df[\"price\"].value_counts()\n",
|
|||
|
" print(f\"Распределение price в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop(\"price\", axis=1)\n",
|
|||
|
" y = df[\"price\"]\n",
|
|||
|
"\n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
"\n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 39,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение price в обучающей выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"5076 80\n",
|
|||
|
"1789 80\n",
|
|||
|
"3931 80\n",
|
|||
|
"1263 80\n",
|
|||
|
"2026 80\n",
|
|||
|
" ..\n",
|
|||
|
"3678 80\n",
|
|||
|
"4592 80\n",
|
|||
|
"516 80\n",
|
|||
|
"7152 80\n",
|
|||
|
"2353 80\n",
|
|||
|
"Name: count, Length: 9496, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в контрольной выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"966 34\n",
|
|||
|
"13638 34\n",
|
|||
|
"3669 34\n",
|
|||
|
"1052 34\n",
|
|||
|
"2818 34\n",
|
|||
|
" ..\n",
|
|||
|
"4032 34\n",
|
|||
|
"544 34\n",
|
|||
|
"3362 34\n",
|
|||
|
"6559 34\n",
|
|||
|
"792 34\n",
|
|||
|
"Name: count, Length: 5383, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в тестовой выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"3742 33\n",
|
|||
|
"559 33\n",
|
|||
|
"8403 33\n",
|
|||
|
"1238 33\n",
|
|||
|
"1243 33\n",
|
|||
|
" ..\n",
|
|||
|
"1149 33\n",
|
|||
|
"2401 33\n",
|
|||
|
"958 33\n",
|
|||
|
"702 33\n",
|
|||
|
"14618 33\n",
|
|||
|
"Name: count, Length: 5338, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop(\"price\", axis=1)\n",
|
|||
|
" y = df[\"price\"]\n",
|
|||
|
"\n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
"\n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aisenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|