AIM-PIbd-32-Kuzin-P-S/lab_2/lab2.ipynb

1252 lines
437 KiB
Plaintext
Raw Permalink Normal View History

2024-10-10 23:38:34 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Data Science Salaries\n",
"https://www.kaggle.com/datasets/arnabchaki/data-science-salaries-2023 Набор представляет собой данные о зарплатах специалистов в области Data Science (файл ds_salaries.csv)\n",
"Пример цели: Узнать, как зарплата в долларах (salary_in_usd) зависит от уровня опыта\n",
"Входные данные: год, уровень опыта, тип занятости, должность, зарплата, валюта зарплаты, зарплата в USD, страна проживания сотрудника, доля удалённой работы, страна компании, размер компании"
]
},
{
"cell_type": "code",
"execution_count": 261,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['work_year', 'experience_level', 'employment_type', 'job_title',\n",
" 'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',\n",
" 'remote_ratio', 'company_location', 'company_size'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"df = pd.read_csv(\".//static//csv//ds_salaries.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 262,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA7kklEQVR4nO3dd5hV1b0//s8MMAMIMwOIFKWIYgFB7EGsERsYSxRLiIKxxKhRE0tiQEFjvhpbEonGGK+i6JWrXlssESIiSQRUFBGwBiyRJjoUpTP794e/OZdDGQbUNYO8Xs8zD5y911nrs/dZz5nznrNLQZZlWQAAAHzDCmu6AAAAYPMgfAAAAEkIHwAAQBLCBwAAkITwAQAAJCF8AAAASQgfAABAEsIHAACQhPABAAAkIXwAfEXvv/9+FBQUxNChQ2u6lE3e0KFDo6CgIN5///2aLmWtBg8eHAUFBTVdxlfSvn376N+/f02XAWymhA+g1qj84Lnqz1ZbbRUHH3xwPPPMM8nrGT16dF4t9erViw4dOsRpp50W06ZN+1rGePHFF2Pw4MExb968r6U/AKjNhA+g1rn66qtj2LBhce+998Zll10Wn3zySfTq1SuefPLJGqnnggsuiGHDhsUdd9wRvXv3jv/5n/+JvfbaK2bMmPGV+37xxRfjqquuEj42EQMHDozFixfXdBkAm6y6NV0AwOqOPPLI2HPPPXOPzzjjjGjRokU88MADcdRRRyWvZ//9948TTjghIiJOP/302GGHHeKCCy6Ie+65Jy6//PLk9VB9ixYtioYNG35t/dWtWzfq1vWrE2Bj+eYDqPXKysqiQYMGa3zo++KLL+Liiy+ONm3aRHFxcey4445x4403RpZlERGxePHi2GmnnWKnnXbK+2v1Z599Fq1atYp99903Vq5cucH1fPe7342IiOnTp1fZbtSoUbH//vvHFltsEWVlZXHMMcfEm2++mVs/ePDguPTSSyMiYtttt80d3lWd8x1WPzyt8mf06NFrtO3fv/9a2w4ePDiv3cMPPxx77rlnNG7cOK/djTfeWGUty5cvj6uuuio6duwY9evXj2bNmsV+++0XI0eOzLWZNGlS9O/fPzp06BD169ePli1bxo9+9KP49NNP17utjz/+ePTu3Ttat24dxcXFsd1228Wvf/3rNV67gw46KHbZZZeYMGFCHHDAAdGwYcP41a9+Ff369Ystt9wyli9fvkbfhx12WOy4447rraHS2s75KCgoiPPPPz8ee+yx2GWXXaK4uDg6d+4cf/vb36rdb8T/Hea3+mu4tnOKZs2aFaeffnpss802UVxcHK1atYpjjjkmb+5kWRbXXHNNbLPNNtGwYcM4+OCDY8qUKRtUE8DXzZ9vgFpn/vz5MXfu3MiyLObMmRNDhgyJzz//PH74wx/m2mRZFkcffXQ8//zzccYZZ0S3bt3i2WefjUsvvTQ+/vjj+N3vfhcNGjSIe+65J3r06BEDBgyIm2++OSIizjvvvJg/f34MHTo06tSps8H1/fvf/46IiGbNmq2zzd///vc48sgjo0OHDjF48OBYvHhxDBkyJHr06BGvvvpqtG/fPr7//e/HO++8Ew888ED87ne/iy233DIiIpo3b16tOg499NA47bTTIiLi5ZdfjltuuWWdbbfccsv43e9+l3t86qmn5q0fO3ZsnHjiibHrrrvGddddF6WlpTF37tz42c9+tt46Bg8eHNdee22ceeaZsffee8eCBQvilVdeiVdffTUOPfTQiIgYOXJkTJs2LU4//fRo2bJlTJkyJe64446YMmVKjBs3rsqTuIcOHRqNGjWKn//859GoUaMYNWpUXHnllbFgwYK44YYb8tp++umnceSRR8bJJ58cP/zhD6NFixaxxRZbxL333hvPPvts3jdns2bNilGjRsWgQYPWu43r889//jMeeeSROPfcc6Nx48Zxyy23xPHHHx8ffvhhlfNkYx1//PExZcqU+OlPfx
rt27ePOXPmxMiRI+PDDz+M9u3bR0TElVdeGddcc0306tUrevXqFa+++mocdthhsWzZsq+9HoBqywBqibvvvjuLiDV+iouLs6FDh+a1feyxx7KIyK655pq85SeccEJWUFCQvffee7lll19+eVZYWJiNGTMme+ihh7KIyH7/+9+vt57nn38+i4jsrrvuyj755JNsxowZ2VNPPZW1b98+KygoyF5++eUsy7Js+vTpWURkd999d+653bp1y7baaqvs008/zS17/fXXs8LCwuy0007LLbvhhhuyiMimT59e7f20bNmyLCKy888/P7escruef/75Ndr37ds323bbbfOWRUQ2aNCg3OPLL788i4hs5syZuWWV23XDDTdUWc+uu+6a9e7du8o2ixYtWmPZAw88kEVENmbMmNyyyjmw6v5Y23N//OMfZw0bNsyWLFmSW3bggQdmEZHdfvvteW1XrlyZbbPNNtlJJ52Ut/zmm2/OCgoKsmnTplVZ+6oGDRqUrf6rMyKyoqKivDn3+uuvZxGRDRkypNp9V8631V/D1edXeXn5el+XOXPmZEVFRVnv3r2zioqK3PJf/epXWURk/fr1q3ZdAF8nh10Btc6tt94aI0eOjJEjR8Z9990XBx98cJx55pnxyCOP5No8/fTTUadOnbjgggvynnvxxRdHlmV5V8caPHhwdO7cOfr16xfnnntuHHjggWs8ryo/+tGPonnz5tG6devo3bt3fPHFF3HPPffknZeyqpkzZ8bEiROjf//+0bRp09zyrl27xqGHHhpPP/10tcdemyVLlkRERP369avVftmyZVFcXFxlm4ULF0ZhYWGUlZVtcD1lZWUxZcqUePfdd9fZpkGDBrn/L1myJObOnRvf+c53IiLi1VdfrbL/VZ+7cOHCmDt3buy///6xaNGieOutt/LaFhcXx+mnn563rLCwMPr27RtPPPFELFy4MLf8/vvvj3333Te23Xbb9W/kevTs2TO222673OOuXbtGSUnJ13ZVtFU1aNAgioqKYvTo0VFeXr7WNn//+99j2bJl8dOf/jTvW6WLLrroa68HYEMIH0Cts/fee0fPnj2jZ8+e0bdv33jqqaeiU6dOcf755+cOGfnggw+idevW0bhx47zn7rzzzrn1lYqKiuKuu+6K6dOnx8KFC+Puu+/eoHs1XHnllTFy5MgYNWpUTJo0KWbMmLHGYUurqhx7becS7LzzzjF37tz44osvqj3+6ubOnRsREaWlpdVqP2/evGjUqFGVbbp37x4VFRVx4YUXxr///e+YO3fuOj/Yru7qq6+OefPmxQ477BBdunSJSy+9NCZNmpTX5rPPPosLL7wwWrRoEQ0aNIjmzZvnPvTPnz+/yv6nTJkSxx13XJSWlkZJSUk0b948dwje6s/deuuto6ioaI0+TjvttFi8eHE8+uijERHx9ttvx4QJE6p8HTdE27Zt11jWpEmTau/DDVFcXBy//e1v45lnnokWLVrEAQccENdff33MmjUr16ZyDnbs2DHvuc2bN48mTZp87TUBVJfwAdR6hYWFcfDBB8fMmTOr/Ot6VZ599tmI+PKv7hvaR5cuXaJnz55x8MEHR5cuXWr8akeVJxVXHtu/PrNmzYqWLVtW2ebkk0+Oiy++OIYOHRrbb799NG/ePHbfffdq9X/AAQfEv//977jrrrtil112iTvvvDN23333uPPOO3NtTjzxxPjLX/4S55xzTjzyyCMxYsSI3AnZFRUV6+x73rx5ceCBB8brr78eV199dfz1r3+NkSNHxm9/+9u1PnfVb0lW1alTp9hjjz3ivvvui4iI++67L4qKiuLEE0+s1jauz7rOHcr+/4sfVMe6AvHaLopw0UUXxTvvvBPXXntt1K9fP6644orYeeed47XXXqv2eAA1QfgANgkrVqyIiIjPP/88IiLatWsXM2bMyDuMJiJyh+G0a9cut2zSpElx9dVXx+mnnx677bZbnHnmmev9a/tXUTn222+/vca6t9
56K7bccsvYYostImLdHzir8sorr0RErPOwr1UtX7483nvvvdw3QutSWFgYN954Yxx00EHRsWPH3CFv1dW0adM4/fT
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Визуализация данных — ящик с усами. Как видим, выборка относительно сбалансирована: значения сгущаются к середине диапазона, медиана расположена примерно по центру\n",
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(x=df[\"salary_in_usd\"])\n",
"plt.title(\"Box Plot для salary_in_usd\")\n",
"plt.xlabel(\"salary_in_usd\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 263,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0sAAAIjCAYAAADSlID1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABwG0lEQVR4nO3deXhU1f3H8c9MkpnsGxASICRBNsMii6AoixUsigtuFa2iWEW0WsTWDZeibRXFautScavi0l8FFyhWRFBxR3ABWYLIFjYhAbLvycz5/ZFmzJCbZDJMmADv1/PMo7lz7jnfe+bO8uHO3GszxhgBAAAAALzYg10AAAAAALRFhCUAAAAAsEBYAgAAAAALhCUAAAAAsEBYAgAAAAALhCUAAAAAsEBYAgAAAAALhCUAAAAAsEBYAgAAAAALhCUAaMR9990nm80W7DK8nHbaaTrttNOCXUaLfPzxx7LZbPr444+DXcoRzWaz6b777gt2GQGVnp6uSZMmBbsMSW2rFgBtB2EJAAAAACyEBrsAAIDvlixZEuwSECTl5eUKDeVtGwAOJ44sAcBhYIxReXn5IffjcDjkcDgCUBGOBG63WxUVFZKk8PBwwhIAHGaEJQBHleLiYk2bNk3p6elyOp1KSkrSGWecoe+++87T5rPPPtOvfvUrde3aVU6nU6mpqbrlllt8CjMvvfSSTj/9dCUlJcnpdCozM1OzZ89u0C49PV3nnHOO3n//fZ144omKiIjQs88+q1GjRumEE06w7LtXr14aO3Zsk+Mf/Julut8DzZs3Tw888IC6dOmi8PBwjR49Wps3b252eyZNmqT09PQGy61+r7V06VINHz5c8fHxio6OVq9evXTXXXd5tdm1a5fOP/98RUVFKSkpSbfccosqKyubrePNN9+UzWbTJ5980uC+Z599VjabTevWrZMk7d27V1dffbW6dOkip9OplJQUjR8/XtnZ2c2O88MPP+jiiy9WYmKiwsPDdeKJJ2rhwoWe+3Nzc9WhQweddtppMsZ4lm/evFlRUVGaMGGCZ9lpp52mvn376ttvv9Upp5yiiIgIZWRk6JlnnmkwbmVlpWbMmKHu3bt79rnbb7+9wdzYbDbddNNN+te//qU+ffrI6XRq8eLFnvsO/s3S7t279Zvf/EYdO3aU0+lUnz599OKLL3q1aek+smLFCo0bN04JCQmKiopS//799fjjj7doHg9FQUGBpk2bptTUVDmdTnXv3l0PP/yw3G63JKm6ulqJiYm6+uqrG6xbVFSk8PBw3XrrrZ5lvs49AFjhn6gAHFWuv/56vfnmm7rpppuUmZmpAwcO6PPPP9eGDRs0aNAgSdIbb7yhsrIy3XDDDWrXrp1WrlypJ598Urt27dIbb7zRZP+zZ89Wnz59dN555yk0NFTvvPOOfvvb38rtduvGG2/0artx40ZddtllmjJliiZPnqxevXopOjpakydP1rp169S3b19P26+//lo//vij7rnnHr+2+6GHHpLdbtett96qwsJCzZo1S5dffrlWrFjhV38HW79+vc455xz1799ff/rTn+R0OrV582Z98cUXnjbl5eUaPXq0duzYoalTp6pTp0569dVX9dFHHzXb/9lnn63o6GjNmzdPo0aN8rpv7ty56tOnj2e+LrroIq1fv16/+93vlJ6ertzcXC1dulQ7duywDH71t+HUU09V586ddeeddyoqKkrz5s3T+eefr7feeksXXHCBkpKSNHv2bP3qV7/Sk08+qalTp8rtdmvSpEmKiYnR008/7dVnfn6+xo0bp0suuUSXXXaZ5s2bpxtuuEEOh0O/+c1vJNUeHTrvvPP0+eef67rrrtPxxx+vtWvX6m9/+5t+/PFHLViwwKvPjz76SPPmzdNNN92k9u3bN7pNOTk5Ovnkkz0Bq0OHDnrvvfd0zTXXqKioSNOmTfNq78s+snTpUp1zzjlKSUnRzTffrOTkZG3YsEH//e9/dfPNN/s8j/
4qKyvTqFGjtHv3bk2ZMkVdu3bVl19+qenTp2vPnj36+9//rrCwMF1wwQV6++239eyzz3odaV2wYIEqKyt16aWX+jX3ANCAAYCjSFxcnLnxxhubbFNWVtZg2cyZM43NZjPbt2/3LJsxY4Y5+GXSat2xY8eabt26eS1LS0szkszixYu9lhcUFJjw8HBzxx13eC2fOnWqiYqKMiUlJU3WPmrUKDNq1CjP38uWLTOSzPHHH28qKys9yx9//HEjyaxdu7bJ/q666iqTlpbWYPnB2/63v/3NSDL79u1rtK+///3vRpKZN2+eZ1lpaanp3r27kWSWLVvWZC2XXXaZSUpKMjU1NZ5le/bsMXa73fzpT38yxhiTn59vJJlHHnmkyb6sjB492vTr189UVFR4lrndbnPKKaeYHj16NKglMjLS/Pjjj+aRRx4xksyCBQu82owaNcpIMo8++qhnWWVlpRkwYIBJSkoyVVVVxhhjXn31VWO3281nn33mtf4zzzxjJJkvvvjCs0ySsdvtZv369Q3ql2RmzJjh+fuaa64xKSkpZv/+/V7tLr30UhMXF+fZV33dR2pqakxGRoZJS0sz+fn5Xn263W6/5rE5aWlp5qqrrvL8/ec//9lERUWZH3/80avdnXfeaUJCQsyOHTuMMca8//77RpJ55513vNqNGzfO67nYkrk/uBYAMMYYvoYH4KgSHx+vFStW6Keffmq0TUREhOf/S0tLtX//fp1yyikyxmjVqlVN9l9/3cLCQu3fv1+jRo3S1q1bVVhY6NU2IyOjwdfq4uLiNH78eP373//2fM3L5XJp7ty5nq+v+ePqq6/2+hf2ESNGSJK2bt3qV38Hi4+PlyT95z//8Xwd6mCLFi1SSkqKLr74Ys+yyMhIXXfddT6NMWHCBOXm5nqdYvzNN9+U2+32fP0tIiJCDodDH3/8sfLz832uPy8vTx999JEuueQSFRcXa//+/dq/f78OHDigsWPHatOmTdq9e7en/VNPPaW4uDhdfPHFuvfeezVx4kSNHz++Qb+hoaGaMmWK52+Hw6EpU6YoNzdX3377raTaI5nHH3+8evfu7Rl3//79Ov300yVJy5Yt8+pz1KhRyszMbHJ7jDF66623dO6558oY49Xv2LFjVVhY6PXVU6n5fWTVqlXatm2bpk2b5nm869R9JbOl89hSb7zxhkaMGKGEhASvbRozZoxcLpc+/fRTSdLpp5+u9u3ba+7cuZ518/PztXTpUq+vSrZ07gHgYIQlAEeVWbNmad26dUpNTdXQoUN13333NQgMO3bs0KRJk5SYmKjo6Gh16NDB89WvgwPPwb744guNGTNGUVFRio+PV4cOHTy/27EKS1auvPJK7dixQ5999pkk6YMPPlBOTo4mTpzo1zZLUteuXb3+TkhIkKQWBYqmTJgwQaeeeqquvfZadezYUZdeeqnmzZvnFZy2b9+u7t27N/itU69evXwa48wzz1RcXJzXB+C5c+dqwIAB6tmzpyTJ6XTq4Ycf1nvvvaeOHTtq5MiRmjVrlvbu3dtk35s3b5YxRvfee686dOjgdZsxY4ak2t8r1UlMTNQTTzyhNWvWKC4uTk888YRlv506dWoQcOtqrfsN1aZNm7R+/foG49a1qz+u1Ph+U9++fftUUFCg5557rkG/db/lObjf5vaRLVu2SJLX10MP1tJ5bKlNmzZp8eLFDfoeM2aMV9+hoaG66KKL9J///Mfz26O3335b1dXVXmGppXMPAAfjN0sAjiqXXHKJRowYofnz52vJkiV65JFH9PDDD+vtt9/WWWedJZfLpTPOOEN5eXm644471Lt3b0VFRWn37t2aNGlSo0dNpNoPk6NHj1bv3r312GOPKTU1VQ6HQ4sWLdLf/va3BuvWPwpV39ixY9WxY0e99tprGjlypF577TUlJyd7PhD6IyQkxHK5qXeSAiuNXXTX5XJ5/R0REaFPP/1Uy5Yt07vvvqvFixdr7ty5Ov3007VkyZJGx28Jp9
Op888/X/Pnz9fTTz+tnJwcffHFF3rwwQe92k2bNk3nnnuuFixYoPfff1/33nuvZs6cqY8++kgDBw607Lvusbn11ls
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Визуализируем соотношение зарплаты и уровня опыта\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df[\"salary_in_usd\"], df[\"experience_level\"])\n",
"plt.xlabel(\"salary_in_usd\")\n",
"plt.ylabel(\"experience_level\")\n",
"plt.title(\"salary in usd vs experience_level\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 264,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы:\n",
" work_year experience_level employment_type \\\n",
"33 2023 SE FT \n",
"68 2023 SE FT \n",
"83 2022 EN FT \n",
"133 2023 SE FT \n",
"145 2023 SE FT \n",
"... ... ... ... \n",
"3522 2020 MI FT \n",
"3675 2021 EX CT \n",
"3697 2020 EX FT \n",
"3747 2021 MI FT \n",
"3750 2020 SE FT \n",
"\n",
" job_title salary salary_currency \\\n",
"33 Computer Vision Engineer 342810 USD \n",
"68 Applied Scientist 309400 USD \n",
"83 AI Developer 300000 USD \n",
"133 Machine Learning Engineer 342300 USD \n",
"145 Machine Learning Engineer 318300 USD \n",
"... ... ... ... \n",
"3522 Research Scientist 450000 USD \n",
"3675 Principal Data Scientist 416000 USD \n",
"3697 Director of Data Science 325000 USD \n",
"3747 Applied Machine Learning Scientist 423000 USD \n",
"3750 Data Scientist 412000 USD \n",
"\n",
" salary_in_usd employee_residence remote_ratio company_location \\\n",
"33 342810 US 0 US \n",
"68 309400 US 0 US \n",
"83 300000 IN 50 IN \n",
"133 342300 US 0 US \n",
"145 318300 US 100 US \n",
"... ... ... ... ... \n",
"3522 450000 US 0 US \n",
"3675 416000 US 100 US \n",
"3697 325000 US 100 US \n",
"3747 423000 US 50 US \n",
"3750 412000 US 100 US \n",
"\n",
" company_size \n",
"33 M \n",
"68 L \n",
"83 L \n",
"133 L \n",
"145 M \n",
"... ... \n",
"3522 M \n",
"3675 S \n",
"3697 L \n",
"3747 L \n",
"3750 L \n",
"\n",
"[63 rows x 11 columns]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1MAAAIjCAYAAADm7UHpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB+ZElEQVR4nO3deXwU9f3H8ffuJrub+w4JEJJwGw4RBAE5rIAg3iceKFRFa7UerXe1YlvFo5faitevYrWtIFWLVVFUvAABBUQOOQPhTMhB7mST3fn9EbNmc+5OFhPg9Xw88oDMfuc7n+8xk/ns7M5YDMMwBAAAAAAIiLWjAwAAAACAoxHJFAAAAACYQDIFAAAAACaQTAEAAACACSRTAAAAAGACyRQAAAAAmEAyBQAAAAAmkEwBAAAAgAkkUwAAAABgAskUALRg9uzZslgsHR2Gj9NOO02nnXZaR4cRkE8++UQWi0WffPJJR4dyVLNYLJo9e3ZHhxFUGRkZmjlzZkeHIalzxQLg6EEyBQAAAAAmhHR0AAAA/33wwQcdHQI6SGVlpUJC+LMNAJ0JV6YA4EdgGIYqKyvbXY/dbpfdbg9CRDgaeDweVVVVSZKcTifJFAB0MiRTAI4ppaWluu2225SRkSGHw6Hk5GRNmjRJa9as8Zb5/PPPdckll6hHjx5yOBxKS0vT7bff7ley89JLL+n0009XcnKyHA6HsrKyNHfu3CblMjIydPbZZ+v999/XySefrLCwMD333HMaP368TjzxxGbr7tevnyZPntzq9ht/Z6r++0gLFizQww8/rO7du8vpdGrChAnavn17m+2ZOXOmMjIymixv7vtiS5Ys0ZgxYxQbG6vIyEj169dP9913n0+ZvXv36vzzz1dERISSk5N1++23q7q6us04Fi5cKIvFok8//bTJa88995wsFos2bNggSTp48KB++tOfqnv37nI4HEpNTdV5552nXbt2tbmd7777ThdffLHi4+PldDp18skna9GiRd7X8/LylJSUpNNOO02GYXiXb9++XREREZo2bZp32WmnnaaBAwfq66+/1ujRoxUWFqbMzEw9++yzTbZbXV2tBx98UL179/bOubvuuqtJ31gsFt1888365z//qQEDBsjhcGjx4sXe1xp/Z2rfvn265ppr1KVLFzkcDg0YMEB///vffcoEOkdWrlypqVOnKi4uThERERo8eLCefPLJgPqxPQ4fPqzbbrtNaWlpcjgc6t27tx577DF5PB5JUk1NjeLj4/XTn/60ybolJSVyOp264447vMv87XsAMIO3uAAcU372s59p4cKFuvnmm5WVlaWCggJ98cUX2rx5s4YOHSpJev3111VRUaEbb7xRCQkJWrVqlZ5++mnt3btXr7/+eqv1z507VwMGDNC5556rkJAQvf322/r5z38uj8ejm266yafsli1bdPnll+uGG27QrFmz1K9fP0VGRmrWrFnasGGDBg4c6C27evVqbd26Vffff7+pdj/66KOyWq264447VFxcrMcff1xXXnmlVq5caaq+xjZu3Kizzz5bgwcP1m9/+1s5HA5t375dy5Yt85aprKzUhAkTlJOTo1tuuUVdu3bVK6+8oo8//rjN+s866yxFRkZqwYIFGj9+vM9r8+fP14ABA7z9ddFFF2njxo36xS9+oYyMDOXl5WnJkiXKyclpNjFs2IZTTz1V3bp10z333KOIiAgtWLBA559/vv7zn//oggsuUHJysubOnatLLrlETz/9tG655RZ5PB7NnDlTUVFReuaZZ3zqLCoq0tSpU3XppZfq8ssv14IFC3TjjTfKbrfrmmuukVR3dencc8/VF198oeuvv14nnHCCvv32W/35z3/W1q1b9dZbb/nU+fHHH2vBggW6+eablZiY2GKbcnNzNXLkSG8ClpSUpPfee0/XXnutSkpKdNttt/mU92eOLFmyRGeffbZSU1N16623KiUlRZs3b9b//vc/3XrrrX73o1
kVFRUaP3689u3bpxtuuEE9evTQ8uXLde+99+rAgQP6y1/+otDQUF1wwQV644039Nxzz/lcqX3rrbdUXV2tyy67zFTfA0DADAA4hsTExBg33XRTq2UqKiqaLJszZ45hsViM3bt3e5c9+OCDRuPDZHPrTp482ejZs6fPsvT0dEOSsXjxYp/lhw8fNpxOp3H33Xf7LL/llluMiIgIo6ysrNXYx48fb4wfP977+9KlSw1JxgknnGBUV1d7lz/55JOGJOPbb79ttb4ZM2YY6enpTZY3bvuf//xnQ5Jx6NChFuv6y1/+YkgyFixY4F1WXl5u9O7d25BkLF26tNVYLr/8ciM5Odmora31Ljtw4IBhtVqN3/72t4ZhGEZRUZEhyXjiiSdaras5EyZMMAYNGmRUVVV5l3k8HmP06NFGnz59msQSHh5ubN261XjiiScMScZbb73lU2b8+PGGJOOPf/yjd1l1dbUxZMgQIzk52XC5XIZhGMYrr7xiWK1W4/PPP/dZ/9lnnzUkGcuWLfMuk2RYrVZj48aNTeKXZDz44IPe36+99lojNTXVyM/P9yl32WWXGTExMd656u8cqa2tNTIzM4309HSjqKjIp06Px2OqH9uSnp5uzJgxw/v77373OyMiIsLYunWrT7l77rnHsNlsRk5OjmEYhvH+++8bkoy3337bp9zUqVN99sVA+r5xLADgDz7mB+CYEhsbq5UrV2r//v0tlgkLC/P+v7y8XPn5+Ro9erQMw9DatWtbrb/husXFxcrPz9f48eO1c+dOFRcX+5TNzMxs8rG9mJgYnXfeefr3v//t/RiZ2+3W/PnzvR+PM+OnP/2pzzv0Y8eOlSTt3LnTVH2NxcbGSpL++9//ej9u1di7776r1NRUXXzxxd5l4eHhuv766/3axrRp05SXl+dzC/WFCxfK4/F4P14XFhYmu92uTz75REVFRX7HX1hYqI8//liXXnqpSktLlZ+fr/z8fBUUFGjy5Mnatm2b9u3b5y3/17/+VTExMbr44ov1wAMP6KqrrtJ5553XpN6QkBDdcMMN3t/tdrtuuOEG5eXl6euvv5ZUdyX0hBNOUP/+/b3bzc/P1+mnny5JWrp0qU+d48ePV1ZWVqvtMQxD//nPf3TOOefIMAyfeidPnqzi4mKfj7ZKbc+RtWvXKjs7W7fddpt3vOvVf+Qz0H4M1Ouvv66xY8cqLi7Op00TJ06U2+3WZ599Jkk6/fTTlZiYqPnz53vXLSoq0pIlS3w+ihlo3wNAoEimABxTHn/8cW3YsEFpaWkaMWKEZs+e3SShyMnJ0cyZMxUfH6/IyEglJSV5P1rWOCFqbNmyZZo4caIiIiIUGxurpKQk7/eGmkummnP11VcrJydHn3/+uSTpww8/VG5urq666ipTbZakHj16+PweFxcnSQElHK2ZNm2aTj31VF133XXq0qWLLrvsMi1YsMAnsdq9e7d69+7d5LtW/fr182sbU6ZMUUxMjM8J8vz58zVkyBD17dtXkuRwOPTYY4/pvffeU5cuXTRu3Dg9/vjjOnjwYKt1b9++XYZh6IEHHlBSUpLPz4MPPiip7vtS9eLj4/XUU09p/fr1iomJ0VNPPdVsvV27dm2SANfHWv8drm3btmnjxo1NtltfruF2pZbnTUOHDh3S4cOH9fzzzzept/67RI3rbWuO7NixQ5J8Pn7aWKD9GKht27Zp8eLFTeqeOHGiT90hISG66KKL9N///tf73ac33nhDNTU1PslUoH0PAIHiO1MAjimXXnqpxo4dqzfffFMffPCBnnjiCT322GN64403dOaZZ8rtdmvSpEkqLCzU3Xffrf79+ysiIkL79u3TzJkzW7zqItWdbE6YMEH9+/fXn/70J6Wlpclut+vdd9/Vn//85ybrNryK1dDkyZPVpUsXvfrqqxo3bpxeffVVpaSkeE8YzbDZbM0uNxrcRKE5LT2U2O12+/weFhamzz77TEuXLtU777yjxYsXa/78+Tr99NP1wQ
cftLj9QDgcDp1//vl688039cwzzyg3N1fLli3TI4884lPutttu0znnnKO33npL77//vh544AHNmTNHH3/8sU466aR
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# The data contains noise (outliers); remove it.\n",
"\n",
"\n",
"# Flag outliers in salary_in_usd with the interquartile-range (IQR) rule.\n",
"Q1 = df[\"salary_in_usd\"].quantile(0.25)\n",
"Q3 = df[\"salary_in_usd\"].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"\n",
"# A value is an outlier when it lies more than 1.5 * IQR below Q1 or above Q3.\n",
"threshold = 1.5 * IQR\n",
"outliers = (df[\"salary_in_usd\"] < (Q1 - threshold)) | (\n",
"    df[\"salary_in_usd\"] > (Q3 + threshold)\n",
")\n",
"\n",
"# Show the rows flagged as outliers.\n",
"print(\"Выбросы:\")\n",
"print(df[outliers])\n",
"\n",
"# Drop the flagged rows directly with the boolean mask.\n",
"# Filtering on the mask (instead of writing a 0 sentinel and filtering it out)\n",
"# keeps any row whose salary is genuinely 0 and is not an outlier.\n",
"df = df[~outliers]\n",
"\n",
"# Scatter plot of salary vs experience level on the cleaned data.\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df[\"salary_in_usd\"], df[\"experience_level\"])\n",
"plt.xlabel(\"salary_in_usd\")\n",
"plt.ylabel(\"experience_level\")\n",
"plt.title(\"salary in usd vs experience_level\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Теперь создадим выборки."
]
},
{
"cell_type": "code",
"execution_count": 265,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 2214\n",
"Размер контрольной выборки: 739\n",
"Размер тестовой выборки: 739\n",
"Распределение salary_in_usd в обучающей выборке:\n",
"salary_in_usd\n",
"130000 60\n",
"150000 59\n",
"100000 56\n",
"160000 56\n",
"120000 52\n",
" ..\n",
"127500 1\n",
"9466 1\n",
"57872 1\n",
"134024 1\n",
"122900 1\n",
"Name: count, Length: 741, dtype: int64\n",
"\n",
"Распределение salary_in_usd в контрольной выборке:\n",
"salary_in_usd\n",
"100000 25\n",
"150000 20\n",
"140000 19\n",
"120000 16\n",
"135000 16\n",
" ..\n",
"240500 1\n",
"93919 1\n",
"77364 1\n",
"87738 1\n",
"99050 1\n",
"Name: count, Length: 354, dtype: int64\n",
"\n",
"Распределение salary_in_usd в тестовой выборке:\n",
"salary_in_usd\n",
"120000 23\n",
"150000 19\n",
"100000 18\n",
"160000 16\n",
"200000 13\n",
" ..\n",
"109000 1\n",
"133000 1\n",
"245000 1\n",
"51039 1\n",
"146300 1\n",
"Name: count, Length: 364, dtype: int64\n",
"\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the cleaned data as the test set.\n",
"train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n",
"\n",
"# Split the remaining 80% into train (75%) and validation (25%),\n",
"# i.e. roughly 60/20/20 of the whole data set.\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"\n",
"def check_balance(df, name):\n",
"    \"\"\"Print how many times each salary_in_usd value occurs in ``df``.\n",
"\n",
"    Args:\n",
"        df: DataFrame that has a ``salary_in_usd`` column.\n",
"        name: Label of the sample, inserted into the printed heading.\n",
"\n",
"    Returns:\n",
"        None. The heading, the value counts and a blank line are printed.\n",
"    \"\"\"\n",
"    counts = df[\"salary_in_usd\"].value_counts()\n",
"    print(f\"Распределение salary_in_usd в {name}:\")\n",
"    print(counts)\n",
"    print()\n",
"\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")"
]
},
{
"cell_type": "code",
"execution_count": 266,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение salary_in_usd в обучающей выборке после oversampling:\n",
"salary_in_usd\n",
"127221 60\n",
"105000 60\n",
"100000 60\n",
"260000 60\n",
"130000 60\n",
" ..\n",
"110000 60\n",
"113900 60\n",
"54685 60\n",
"193900 60\n",
"50000 60\n",
"Name: count, Length: 741, dtype: int64\n",
"\n",
"Распределение salary_in_usd в контрольной выборке после oversampling:\n",
"salary_in_usd\n",
"99050 25\n",
"126277 25\n",
"38400 25\n",
"56738 25\n",
"215050 25\n",
" ..\n",
"75000 25\n",
"140000 25\n",
"100000 25\n",
"175000 25\n",
"90734 25\n",
"Name: count, Length: 354, dtype: int64\n",
"\n",
"Распределение salary_in_usd в тестовой выборке после oversampling:\n",
"salary_in_usd\n",
"219000 23\n",
"143860 23\n",
"72500 23\n",
"140000 23\n",
"66837 23\n",
" ..\n",
"126000 23\n",
"109000 23\n",
"220000 23\n",
"250000 23\n",
"80000 23\n",
"Name: count, Length: 364, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"\n",
"def oversample(df, target=\"salary_in_usd\", random_state=42):\n",
"    \"\"\"Randomly oversample ``df`` so that every distinct ``target`` value is equally frequent.\n",
"\n",
"    Each distinct value of ``target`` is treated as its own class by\n",
"    RandomOverSampler; rows of the rarer values are duplicated.\n",
"\n",
"    NOTE(review): salary_in_usd looks like a continuous variable, so almost every\n",
"    value becomes a separate class -- consider binning it first; confirm intent.\n",
"\n",
"    Args:\n",
"        df: Input DataFrame containing the ``target`` column.\n",
"        target: Name of the column used as the class label.\n",
"        random_state: Seed passed to RandomOverSampler.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with the resampled rows; ``target`` is the last column.\n",
"    \"\"\"\n",
"    X = df.drop(target, axis=1)\n",
"    y = df[target]\n",
"\n",
"    oversampler = RandomOverSampler(random_state=random_state)\n",
"    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore\n",
"\n",
"    # Re-attach the label column to the resampled features.\n",
"    return pd.concat([X_resampled, y_resampled], axis=1)\n",
"\n",
"\n",
"train_df_oversampled = oversample(train_df)\n",
"# NOTE(review): oversampling is normally applied to the training split only;\n",
"# confirm that resampling the validation and test splits is intended.\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Forbes Billionaires Database"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://www.kaggle.com/datasets/surajjha101/forbes-billionaires-data-preprocessed Список миллиардеров Forbes\n",
"Использование: Узнать, когда же разбогатеешь\n",
"Входные данные: Имя, Возраст, Страна, компания, Индустрия"
]
},
{
"cell_type": "code",
"execution_count": 267,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"df = pd.read_csv(\".//static//csv//Forbes Billionaires.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Анализируем датафрейм при помощи \"ящика с усами\". Есть смещение в сторону меньших значений, это можно исправить при помощи oversampling и undersampling."
]
},
{
"cell_type": "code",
"execution_count": 268,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsdElEQVR4nO3de5xVVd348e9cYJgEhhQFRgFBDZRCUUmRsHikvKBpKYqhQuCjJqSYlRgmWPLoKy0zCy89ChaEiSlmQoqCGMajoCGSBah4F1CRqyiXWb8/+nFeTXgZbmsYfL9fr/PK2XudmXWW25gPZ+99ilJKKQAAALaz4tqeAAAA8MkgPgAAgCzEBwAAkIX4AAAAshAfAABAFuIDAADIQnwAAABZiA8AACAL8QEAAGQhPgB2ci+++GIUFRXF6NGja3sqAHzCiQ+AGho9enQUFRVVe+yxxx7RvXv3mDRpUvb5PPLII9XmUq9evWjbtm2cddZZ8cILL2yTn/HXv/41hg8fHsuWLdsm36+2TZw4MYqKiqKysjKqqqpqezoAnziltT0BgLrmRz/6UbRp0yZSSrF48eIYPXp0HHfccXHffffF8ccfn30+F1xwQXTu3DnWrVsXTz31VNxyyy1x//33xzPPPBOVlZVb9b3/+te/xhVXXBH9+vWLJk2abJsJ16KxY8fG3nvvHS+++GJMmTIlevToUdtTAvhE8c4HwGY69thj44wzzogzzzwzvvvd78Zf/vKXqFevXowbN65W5tOtW7c444wz4pvf/GbccMMNce2118bSpUvj9ttvr5X57KhWr14d9957b3znO9+JTp06xdixY2t7SgCfOOIDYCs1adIkysvLo7S0+pvJq1evjosvvjhatmwZZWVl0a5du7j22msjpRQREWvWrIn27dtH+/btY82aNYXnLV26NFq0aBFHHHFEbNiwYbPn81//9V8REbFw4cKPHDdlypTo1q1b7LLLLtGkSZM48cQT4x//+Edh//Dhw+N73/teRES0adOmcHrXiy+++LFz+M/T0zY+HnnkkU3G9uvX7wPHDh8+vNq4u+66Kw499NBo1KhRtXHXXnvtx84nIuKee+6JNWvWRK9evaJ3795x9913x3vvvbfJuDVr1sQFF1wQTZs2jUaNGsVXv/rVeO211z5wTq+99lr0798/mjVrFmVlZdGhQ4e47bbbajQfgE8ip10BbKbly5fHW2+9FSmlWLJkSdxwww2xatWqOOOMMwpjUkrx1a9+NaZOnRoDBgyIgw46KB544IH43ve+F6+99lpcd911UV5eHrfffnt07do1hg4dGj/72c8iImLgwIGxfPnyGD16dJSUlGz2/J5//vmIiNhtt90+dMxDDz0Uxx57bLRt2zaGDx8ea9asiRtuuCG6du0aTz31VOy9997x9a9/PebPnx/jxo2L6667Lpo2bRoREbvvvnuN5vHlL385zjrrrIiImDlzZvziF7/40LFNmzaN6667rvD1mWeeWW3/jBkz4tRTT40DDzwwrr766qioqIi33norLrroohrNJeJfp1x17949mjdvHr17944hQ4bEfffdF7169ao2rl+/fnHnnXfGmWeeGYcffnhMmzYtevbsucn3W7x4cRx++OFRVFQUgwYNit133z0mTZoUAwYMiBUrVsTgwYNrPDeAT4wEQI2MGjUqRcQmj7KysjR69OhqYydMmJAiIl155ZXVtp9yyimpqKgoPffcc4Vtl156aSouLk6PPvpoGj9+fIqI9POf//xj5zN16tQUEem2225Lb775Znr99dfT/fffn/bee+9UVFSUZs6cmVJKaeHChSki0qhRowrPPeigg9Iee+yR3n777cK2p59+OhUXF6ezzjqrsO2aa65JEZEWLlxY43Vau3Ztiog0aNCgwraNr2vq1KmbjO/Tp09q06ZNtW0RkYYNG1b4+tJLL00Rkd54443Cto2v65prrvnYOS1evDiVlpamX//614
VtRxxxRDrxxBOrjXvyySdTRKTBgwdX296vX79N5jRgwIDUokWL9NZbb1Ub27t371RRUZHefffdj50XwCeN064ANtOvfvWrmDx5ckyePDnGjBkT3bt3j7PPPjvuvvvuwpiJEydGSUlJXHDBBdWee/HFF0dKqdrdsYYPHx4dOnSIvn37xvnnnx9f/OIXN3neR+nfv3/svvvuUVlZGT179ozVq1fH7bffHoceeugHjn/jjTdi9uzZ0a9fv9h1110L2zt27Bhf/vKXY+LEiTX+2R9k46lMDRo0qNH4tWvXRllZ2UeOWblyZRQXF2/xRe933HFHFBcXx8knn1zYdvrpp8ekSZPinXfeKWz785//HBER559/frXnf/vb3672dUop/vCHP8QJJ5wQKaV46623Co+jjz46li9fHk899dQWzRVgZ+a0K4DN9PnPf77aL/ann356dOrUKQYNGhTHH3981K9fP1566aWorKyMRo0aVXvu/vvvHxERL730UmFb/fr147bbbovOnTtHgwYNYtSoUVFUVFTj+Vx++eXRrVu3KCkpiaZNm8b++++/yfUn/27jz27Xrt0m+/bff/944IEHYvXq1bHLLrvUeA7/7q233oqIiIqKihqNX7ZsWTRs2PAjx3Tp0iV++ctfxoUXXhjf//73o6Kiolo0fJwxY8bE5z//+Xj77bfj7bffjoiITp06xdq1a2P8+PFxzjnnRMS/1qa4uDjatGlT7fn77rtvta/ffPPNWLZsWdxyyy1xyy23fODPXLJkSY3nB/BJIT4AtlJxcXF07949rr/++liwYEF06NBhs7/HAw88EBH/etdgwYIFm/zy+1E+97nP7VC3jN14Qfree+9do/GLFi2K1q1bf+SY3r17x1NPPRU33HDDh/6y/2EWLFgQM2fOjIiI/fbbb5P9Y8eOLcRHTW38jJAzzjgj+vbt+4FjOnbsuFnfE+CTQHwAbAPr16+PiIhVq1ZFRETr1q3joYceipUrV1Z79+Of//xnYf9Gc+bMiR/96EfxzW9+M2bPnh1nn312PPPMMzV+52BzbfzZ8+bN22TfP//5z2jatGnhXY/NeQdmo1mzZkVEfOhpX/9u3bp18dxzz8UxxxzzkeOKi4vj2muvjWeeeSYWLlwYI0eOjMWLF1e7yP/DjB07NurVqxe//e1vN7mAf/r06fGLX/wiXn755WjVqlW0bt06qqqqYuHChdVC5bnnnqv2vN133z0aNWoUGzZs2KHCD2BH55oPgK20bt26ePDBB6N+/fqF06qOO+642LBhQ/zyl7+sNva6666LoqKiOPbYYwvP7devX1RWVsb1118fo0ePjsWLF2/WXZw2V4sWLeKggw6K22+/vdonl8+dOzcefPDBOO644wrbNkbI5nzC+V133RXt2rWL9u3bf+zYe++9N9asWVO4PfBHueGGG2LKlCkxduzY6NGjR3Tt2rVG8xk7dmx069YtTjvttDjllFOqPTbeSnjjZ7QcffTRERExcuTITX72vyspKYmTTz45/vCHP8TcuXM3+ZlvvvlmjeYG8EnjnQ+AzTRp0qTCOxhLliyJ3/3ud7FgwYIYMmRING7cOCIiTjjhhOjevXsMHTo0XnzxxTjwwAPjwQcfjHvvvTcGDx4c++yzT0REXHnllTF79ux4+OGHo1GjRtGxY8e4/PLL47LLLotTTjmlWghsS9dcc00ce+yx0aVLlxgwYEDhVrsVFRXVPsvikEMOiYiIoUOHRu/evaNevXpxwgknfOD1IC+88EL85Cc/iSeeeCK+/vWvx5gxYwr7Np72NHny5GjVqlU0b948hg0bFiNHjowjjjgivvKVr3zkfP/+97/H97///Rg+fHh07ty5xq/z8ccfj+eeey4GDRr0gfv33HPPOPjgg2Ps2LFxySWXxCGHHBInn3xy/PznP4+33367cKvd+fPnR0T1d4KuvvrqmDp1ahx22GHx3//933HAAQfE0qVL46mnnoqHHnooli5dWuN5An
xi1PLdtgDqjA+61W6DBg3SQQcdlG688cZUVVVVbfzKlSvTRRddlCorK1O9evXSfvvtl6655prCuCeffDKVlpamb3/
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Box plot для столбца Age\n",
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(x=df['Age'])\n",
"plt.title('Box Plot для Age')\n",
"plt.xlabel('Age')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 269,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB6P0lEQVR4nO3deXQUVfr/8U8nZGNJQpCQgCwBF4hsoiyRTREERRTRGUEYcUOHAXdnxHEUkFF0/Pobd3AbdQQBnUEQFxwUAZGwCIKGKAJGUUmIEEnCEgLp+v0Ru6VJursq3en1/Ton50DVTdWtSnV1PXXvfa7NMAxDAAAAAADTYoJdAQAAAAAINwRSAAAAAGARgRQAAAAAWEQgBQAAAAAWEUgBAAAAgEUEUgAAAABgEYEUAAAAAFhEIAUAAAAAFhFIAQAAAIBFBFIAAMu+++472Ww2vfLKK8GuioulS5eqe/fuSkxMlM1m0/79+4NdJYSgdu3a6eKLLw52NQCEOQIpADjOl19+qSuuuEJt27ZVYmKiWrVqpSFDhuipp56qt32+/vrrevzxx2ss3717t6ZNm6bNmzfX275PtGLFCtlsNudPXFyc2rdvr6uvvlrffvutX/axZs0aTZs2ze9Bzr59+/T73/9eSUlJeuaZZ/Taa6+pUaNGXn/v2Weflc1mU+/evf1an1DjCH5tNpv++9//1lg/bdo02Ww27d271/K26+tv6ov8/HxNmzZN3333XbCrAiBCEUgBwK/WrFmjs88+W1u2bNGECRP09NNP64YbblBMTIyeeOKJetuvp0Bq+vTpAQ2kHG655Ra99tprev755zV8+HAtWLBAPXv21O7du33e9po1azR9+nS/P3Rv2LBB5eXlmjFjhq6//nqNGzdOcXFxXn9v7ty5ateundavX68dO3b4tU6h6oEHHpBhGH7bXn39TX2Rn5+v6dOnE0gBqDcNgl0BAAgVDz74oFJSUrRhwwalpqa6rCsuLg5OperBwYMHvbbU9O/fX1dccYUk6dprr9Vpp52mW265Ra+++qruueeeQFTTMsff6MS/nScFBQVas2aNFi5cqJtuuklz587V1KlT66mGoaF79+7avHmz3nrrLY0aNSrY1fG7iooKxcfHB7saAKIALVIA8KudO3fqjDPOqPVBPD09vcayOXPmqFevXmrYsKGaNm2qAQMG6H//+59z/eLFizV8+HC1bNlSCQkJ6tChg2bMmKGqqipnmXPPPVfvvvuuvv/+e2e3q3bt2mnFihXq2bOnpOpAxrHu+DFJ69at07Bhw5SSkqKGDRtq4MCB+vTTT13q6OiulZ+fr6uuukpNmzZVv379LJ+bQYMGSaoOPDxZvny5+vfvr0aNGik1NVWXXnqpvvrqK5f6/PnPf5YkZWVlOY/LW6vBm2++qbPOOktJSUk66aSTNG7cOP3000/O9eeee67Gjx8vSerZs6dsNpuuueYar8c1d+5cNW3aVMOHD9cVV1yhuXPn1lpu3759+sMf/qDk5GSlpqZq/Pjx2rJlS63jxL7++mtdccUVSktLU2Jios4++2y9/fbbHutx9OhRpaWl6dprr62xrqysTImJibrrrrucy5566imdccYZzmvv7LPP1uuvv+71eCVp9OjROu2000y3Snm7zjz9TUeNGqUePXq4bG/EiBGy2Wwu52TdunWy2Wx6//33ncu+/fZb/e53v1NaWpoaNmyoPn366N1333XZlqMr6vz58/W3v/1NrVq1UsOGDfXkk0/qd7/7nSTpvPPOc9ZpxYoVLr+/evVq9erVS4mJiWrfvr3+/e9/mzqHACARSAGAU9u2bbVx40bl5eV5LTt9+nT94Q9/UFxcnB544AFNnz5drVu31vLly51lXnnlFTVu3Fh33HGHnnjiCZ111lm6//77NWXKFGeZe++9V927d9dJJ52k1157Ta+99poef/xxderUSQ888IAk6cYbb3SuGzBggKTqgGXAgAEqKy
vT1KlT9dBDD2n//v0aNGiQ1q9fX6O+v/vd73To0CE99NBDmjBhguVzs3PnTklSs2bN3Jb58MMPNXToUBUXF2vatGm64447tGbNGvXt29cZKI0aNUpjxoyRJP3zn/90Hlfz5s3dbveVV17R73//e8XGxmrmzJmaMGGCFi5cqH79+jm7kt1777268cYbJVV3W3vttdd00003eT2uuXPnatSoUYqPj9eYMWO0fft2bdiwwaWM3W7XiBEjNG/ePI0fP14PPvigCgsLnYHb8bZu3ao+ffroq6++0pQpU/TYY4+pUaNGGjlypN566y239YiLi9Nll12mRYsWqbKy0mXdokWLdOTIEY0ePVqS9MILL+iWW25Rdna2Hn/8cU2fPl3du3fXunXrvB6vJMXGxupvf/ubtmzZ4rFOkrnrzNPftH///tqyZYvKysokSYZh6NNPP1VMTIw++eQT534++eQTxcTEqG/fvpKkPXv26JxzztEHH3ygP/3pT3rwwQdVUVGhSy65pNY6z5gxQ++++67uuusuPfTQQ7rgggt0yy23SJL++te/OuvUqVMn5+/s2LFDV1xxhYYMGaLHHntMTZs21TXXXKOtW7eaOo8AIAMAYBiGYfzvf/8zYmNjjdjYWCMnJ8f4y1/+YnzwwQdGZWWlS7nt27cbMTExxmWXXWZUVVW5rLPb7c5/Hzp0qMY+brrpJqNhw4ZGRUWFc9nw4cONtm3b1ii7YcMGQ5Lx8ssv19jHqaeeagwdOrTG/rKysowhQ4Y4l02dOtWQZIwZM8bUOfj4448NSca//vUv4+effzZ2795tvPvuu0a7du0Mm81mbNiwwTAMwygoKKhRt+7duxvp6enGvn37nMu2bNlixMTEGFdffbVz2aOPPmpIMgoKCrzWp7Ky0khPTzc6d+5sHD582Ln8nXfeMSQZ999/v3PZyy+/bEhy1tGbzz77zJBkLFu2zDCM6vN68sknG7feeqtLuf/+97+GJOPxxx93LquqqjIGDRpU4xycf/75RpcuXVz+vna73TjnnHOMU0891WN9PvjgA0OSsWTJEpflF110kdG+fXvn/y+99FLjjDPOMHWMx3P8zR599FHj2LFjxqmnnmp069bNeQ05rpWff/7ZWW+z15m7v6njGn7vvfcMwzCML774wpBk/O53vzN69+7tLHfJJZcYZ555pvP/t912myHJ+OSTT5zLysvLjaysLKNdu3bOz53jem3fvn2Nz9ubb75pSDI+/vjjGueibdu2hiRj1apVzmXFxcVGQkKCceedd5o6nwBAixQA/GrIkCHKzc3VJZdcoi1btugf//iHhg4dqlatWrl0Q1q0aJHsdrvuv/9+xcS43kZtNpvz30lJSc5/l5eXa+/everfv78OHTqkr7/+us713Lx5s7Zv366rrrpK+/bt0969e7V3714dPHhQ559/vlatWiW73e7yO3/84x8t7eO6665T8+bN1bJlSw0fPlwHDx7Uq6++qrPPPrvW8oWFhdq8ebOuueYapaWlOZd37dpVQ4YM0XvvvWf9QCV99tlnKi4u1p/+9CclJiY6lw8fPlwdO3as0dXLirlz56pFixY677zzJFX/7a688krNnz/fpfvl0qVLFRcX59KSFxMTo0mTJrlsr6SkRMuXL9fvf/97599779692rdvn4YOHart27e7dEc80aBBg3TSSSdpwYIFzmW//PKLli1bpiuvvNK5LDU1VT/++GONljMrjm+VWrRoUa1l6nKdnejMM89U48aNtWrVKknVLU8nn3yyrr76am3atEmHDh2SYRhavXq1+vfv7/y99957T7169XLphtq4cWPdeOON+u6775Sfn++yn/Hjx7t83szIzs522Wfz5s11+umn+y07JYDIRyAFAMfp2bOnFi5cqF9++UXr16/XPffco/Lycl1xxRXOh7edO3cqJiZG2dnZHre1detWXXbZZUpJSVFycrKaN2+ucePGSZJKS0vrXMft27dLqn54bN68uc
vPiy++qCNHjtTYflZWlqV93H///Vq2bJmWL1+uL774Qrt379Yf/vAHt+W///57SdLpp59eY12nTp2cD+BWedpux44
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы:\n",
" Rank Name Networth Age Country Source \\\n",
"1311 1292 Kevin David Lehmann 2.4 19 Germany drugstores \n",
"1961 1929 Henrique Dubugras 1.5 26 Brazil fintech \n",
"1975 1929 Pedro Franceschi 1.5 25 Brazil fintech \n",
"2062 1929 Wang Zelong 1.5 25 China chemicals \n",
"2190 2190 Alexandra Andresen 1.3 25 Norway investments \n",
"2191 2190 Katharina Andresen 1.3 26 Norway investments \n",
"\n",
" Industry \n",
"1311 Fashion & Retail \n",
"1961 Finance & Investments \n",
"1975 Finance & Investments \n",
"2062 Metals & Mining \n",
"2190 diversified \n",
"2191 diversified \n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABusklEQVR4nO3de3yT5f3/8XdaSkuBphQoLcihoA4KKGeonBRBUIYiuokDRefQMVDnGecU8cTUr7/NI6hz0wkCc0MRD91QTiKFIggTigyxHmmptPbAoRya+/dHl9i0SXrfadIkzev5ePB40Pu+k1xJ7rT3O9d1fS6bYRiGAAAAAACmxYS6AQAAAAAQaQhSAAAAAGARQQoAAAAALCJIAQAAAIBFBCkAAAAAsIggBQAAAAAWEaQAAAAAwCKCFAAAAABYRJACAAAAAIsIUgAAy7788kvZbDa9/PLLoW6Km+zsbPXr108JCQmy2WwqLS0NdZMQhrp166af/vSnoW4GgAhHkAKAGj799FNdfvnl6tq1qxISEtSpUyeNGzdOTz/9dNAe87XXXtOf/vSnOtsPHDig+++/Xzt27AjaY9e2bt062Ww217+4uDh1795dV199tb744ouAPMamTZt0//33BzzkFBcX6+c//7latGihZ599Vq+++qpatmxZ7+2ee+452Ww2DR06NKDtCTfO8Guz2fTPf/6zzv77779fNptNhw4dsnzfwXpPGyIvL0/333+/vvzyy1A3BUATRZACgP/ZtGmTBg0apJ07d2rmzJl65pln9Ktf/UoxMTF68skng/a4voLU/PnzGzVIOd1000169dVX9cILL2jixIlavny5Bg8erAMHDjT4vjdt2qT58+cH/KJ769atqqio0IMPPqjrrrtO06dPV1xcXL23W7Jkibp166bc3Fx9/vnnAW1TuHrggQdkGEbA7i9Y72lD5OXlaf78+QQpAEHTLNQNAIBw8fDDD8tut2vr1q1KTk5221dUVBSaRgXBkSNH6u2pGTlypC6//HJJ0rXXXqszzzxTN910k1555RXdfffdjdFMy5zvUe33zpf8/Hxt2rRJK1as0A033KAlS5Zo3rx5QWpheOjXr5927NihN954Q1OmTAl1cwKusrJSzZs3D3UzAEQBeqQA4H/279+v3r17e7wQT01NrbNt8eLFGjJkiBITE9WmTRuNGjVK//73v137V65cqYkTJ6pjx46Kj49Xjx499OCDD6qqqsp1zLnnnqt33nlHX331lWvYVbdu3bRu3ToNHjxYUnWQce6rOSdpy5YtmjBhgux2uxITEzV69Gh99NFHbm10DtfKy8vTL37xC7Vp00YjRoyw/NqMGTNGUnXw8GXNmjUaOXKkWrZsqeTkZF1yySXas2ePW3vuuOMOSVJGRobredXXa/D6669r4MCBatGihdq1a6fp06fru+++c+0/99xzNWPGDEnS4MGDZbPZdM0119T7vJYsWaI2bdpo4sSJuvzyy7VkyRKPxxUXF+uqq65SUlKSkpOTNWPGDO3cudPjPLHPPvtMl19+uVJSUpSQkKBBgwbprbfe8tmOkydPKiUlRddee22dfeXl5UpISNDtt9/u2vb000+rd+/ernNv0KBBeu211+p9vpI0depUnXnmmaZ7peo7z3y9p1OmTNGAAQPc7m/SpEmy2Wxur8mWLVtks9n03nvvubZ98cUX+tnPfqaUlBQlJiZq2LBheuedd9zuyzkUddmyZfr973+vTp06KTExUU899ZR+9rOfSZLOO+88V5vWrVvndvuNGzdqyJAhSkhIUPfu3fW3v/3N1GsIABJBCgBcunbtqm3btmnXrl31Hjt//nxdddVViouL0wMPPKD58+erc+fOWrNmjeuYl19+Wa1atdKtt96qJ598UgMHDtR9992nuXPnuo6555571K9fP7Vr106vvvqqXn31Vf3pT39Sr1699MADD0iSrr/+ete+UaNGSaoOLKNGjVJ5eb
nmzZunRx55RKWlpRozZoxyc3PrtPdnP/uZjh49qkceeUQzZ860/Nrs379fktS2bVuvx7z//vsaP368ioqKdP/99+vWW2/Vpk2bNHz4cFdQmjJliq688kpJ0h//+EfX82rfvr3X+3355Zf185//XLGxsVqwYIFmzpypFStWaMSIEa6hZPfcc4+uv/56SdXD1l599VXdcMMN9T6vJUuWaMqUKWrevLmuvPJK7du3T1u3bnU7xuFwaNKkSVq6dKlmzJihhx9+WAUFBa7gVtPu3bs1bNgw7dmzR3PnztUTTzyhli1bavLkyXrjjTe8tiMuLk6XXnqp3nzzTZ04ccJt35tvvqnjx49r6tSpkqQXX3xRN910kzIzM/WnP/1J8+fPV79+/bRly5Z6n68kxcbG6ve//7127tzps02SufPM13s6cuRI7dy5U+Xl5ZIkwzD00UcfKSYmRh9++KHrcT788EPFxMRo+PDhkqSDBw/qnHPO0b/+9S/95je/0cMPP6zKykpdfPHFHtv84IMP6p133tHtt9+uRx55RBdccIFuuukmSdLvfvc7V5t69erlus3nn3+uyy+/XOPGjdMTTzyhNm3a6JprrtHu3btNvY4AIAMAYBiGYfz73/82YmNjjdjYWCMrK8u48847jX/961/GiRMn3I7bt2+fERMTY1x66aVGVVWV2z6Hw+H6/9GjR+s8xg033GAkJiYalZWVrm0TJ040unbtWufYrVu3GpKMv/71r3Ue44wzzjDGjx9f5/EyMjKMcePGubbNmzfPkGRceeWVpl6DtWvXGpKMv/zlL8b3339vHDhwwHjnnXeMbt26GTabzdi6dathGIaRn59fp239+vUzUlNTjeLiYte2nTt3GjExMcbVV1/t2vb4448bkoz8/Px623PixAkjNTXV6NOnj3Hs2DHX9rffftuQZNx3332ubX/9618NSa421ufjjz82JBmrV682DKP6dT3ttNOMm2++2e24f/7zn4Yk409/+pNrW1VVlTFmzJg6r8H5559v9O3b1+39dTgcxjnnnGOcccYZPtvzr3/9y5BkrFq1ym37RRddZHTv3t318yWXXGL07t3b1HOsyfmePf7448apU6eMM844wzj77LNd55DzXPn+++9d7TZ7nnl7T53n8LvvvmsYhmH85z//MSQZP/vZz4yhQ4e6jrv44ouN/v37u37+7W9/a0gyPvzwQ9e2iooKIyMjw+jWrZvrc+c8X7t3717n8/b6668bkoy1a9fWeS26du1qSDI2bNjg2lZUVGTEx8cbt912m6nXEwDokQKA/xk3bpxycnJ08cUXa+fOnXrsscc0fvx4derUyW0Y0ptvvimHw6H77rtPMTHuv0ZtNpvr/y1atHD9v6KiQocOHdLIkSN19OhRffbZZ363c8eOHdq3b59+8YtfqLi4WIcOHdKhQ4d05MgRnX/++dqwYYMcDofbbX79619beoxf/vKXat++vTp27KiJEyfqyJEjeuWVVzRo0CCPxxcUFGjHjh265pprlJKS4tp+1llnady4cXr33XetP1FJH3/8sYqKivSb3/xGCQkJru0TJ05Uz5496wz1smLJkiXq0KGDzjvvPEnV790VV1yhZcuWuQ2/zM7OVlxcnFtPXkxMjGbPnu12fyUlJVqzZo1+/vOfu97vQ4cOqbi4WOPHj9e+ffvchiPWNmbMGLVr107Lly93bfvhhx+0evVqXXHFFa5tycnJ+vbbb+v0nFlRs1fqzTff9HiMP+dZbf3791erVq20YcMGSdU9T6eddpquvvpqbd++XUePHpVhGNq4caNGjhzput27776rIUOGuA1DbdWqla6//np9+eWXysvLc3ucGTNmuH3ezMjMzHR7zPbt2+snP/lJwKpTAmj6CFIAUMPgwYO1YsUK/fDDD8rNzdXdd9+tiooKXX755a6Lt/379ysmJkaZmZk+72v37t269NJLZbfblZSUpPbt22v69OmSpLKyMr/buG/fPknVF4
/t27d3+/fnP/9Zx48fr3P/GRkZlh7jvvvu0+rVq7VmzRr95z//0YEDB3TVVVd5Pf6rr76SJP3kJz+ps69Xr16uC3C
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Имеется смещение в меньшую сторону, в том числе и медианное\n",
"df_cleaned = df.dropna()\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df[\"Age\"], df[\"Networth\"])\n",
"plt.xlabel(\"Age\")\n",
"plt.ylabel(\"Networth\")\n",
"plt.title(\"Scatter Plot of Age vs Networth\")\n",
"plt.show()\n",
"\n",
"# уберем шумы\n",
"\n",
"# Статистический анализ для определения выбросов\n",
"Q1 = df[\"Age\"].quantile(0.25)\n",
"Q3 = df[\"Age\"].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"\n",
"# Определение порога для выбросов\n",
"threshold = 1.5 * IQR\n",
"outliers = (df[\"Age\"] < (Q1 - threshold)) | (\n",
" df[\"Age\"] > (Q3 + threshold)\n",
")\n",
"\n",
"# Вывод выбросов\n",
"print(\"Выбросы:\")\n",
"print(df[outliers])\n",
"\n",
"# Обработка выбросов\n",
"# В данном случае мы занулим выбросы\n",
"median_charge = df[\"Age\"].median()\n",
"df.loc[outliers, \"Age\"] = 0\n",
"\n",
"\n",
"# Визуализация данных после обработки\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df[\"Age\"], df[\"Networth\"])\n",
"plt.xlabel(\"Age\")\n",
"plt.ylabel(\"Networth\")\n",
"plt.title(\"Scatter Plot of Age vs Networth\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 270,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 1560\n",
"Размер контрольной выборки: 520\n",
"Размер тестовой выборки: 520\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Разделение на обучающую и тестовую выборки\n",
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Видим недостаток баланса:"
]
},
{
"cell_type": "code",
"execution_count": 271,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Age в обучающей выборке:\n",
"Age\n",
"64 95\n",
"66 53\n",
"58 51\n",
"59 50\n",
"56 47\n",
" ..\n",
"98 2\n",
"30 1\n",
"29 1\n",
"27 1\n",
"25 1\n",
"Name: count, Length: 73, dtype: int64\n",
"\n",
"Распределение Age в контрольной выборке:\n",
"Age\n",
"64 27\n",
"54 23\n",
"60 17\n",
"57 15\n",
"81 15\n",
" ..\n",
"27 1\n",
"32 1\n",
"29 1\n",
"19 1\n",
"42 1\n",
"Name: count, Length: 66, dtype: int64\n",
"\n",
"Распределение Age в тестовой выборке:\n",
"Age\n",
"64 30\n",
"68 24\n",
"72 22\n",
"65 22\n",
"58 18\n",
" ..\n",
"100 1\n",
"88 1\n",
"93 1\n",
"91 1\n",
"33 1\n",
"Name: count, Length: 62, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df, name):\n",
" counts = df['Age'].value_counts()\n",
" print(f\"Распределение Age в {name}:\")\n",
" print(counts)\n",
" print()\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Используем oversample"
]
},
{
"cell_type": "code",
"execution_count": 272,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Age в обучающей выборке после oversampling:\n",
"Age\n",
"59 95\n",
"70 95\n",
"71 95\n",
"81 95\n",
"67 95\n",
" ..\n",
"94 95\n",
"29 95\n",
"96 95\n",
"27 95\n",
"25 95\n",
"Name: count, Length: 73, dtype: int64\n",
"\n",
"Распределение Age в контрольной выборке после oversampling:\n",
"Age\n",
"57 27\n",
"69 27\n",
"72 27\n",
"64 27\n",
"54 27\n",
" ..\n",
"29 27\n",
"38 27\n",
"19 27\n",
"89 27\n",
"42 27\n",
"Name: count, Length: 66, dtype: int64\n",
"\n",
"Распределение Age в тестовой выборке после oversampling:\n",
"Age\n",
"68 30\n",
"70 30\n",
"76 30\n",
"74 30\n",
"64 30\n",
" ..\n",
"42 30\n",
"88 30\n",
"93 30\n",
"91 30\n",
"33 30\n",
"Name: count, Length: 62, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"def oversample(df):\n",
" X = df.drop('Age', axis=1)\n",
" y = df['Age']\n",
" \n",
" oversampler = RandomOverSampler(random_state=42)\n",
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
" \n",
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 100 Highest-Valued Unicorns\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://www.kaggle.com/datasets/ankanhore545/100-highest-valued-unicorns Самые взлетевшие компании.\n",
"Цель: создать свою супер-компанию\n",
"Входные данные: Название компании, оценочная стоимость, страна, штат, город, индустрия, год основания, имя основателя, количество работников"
]
},
{
"cell_type": "code",
"execution_count": 273,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id gender age hypertension heart_disease ever_married \\\n",
"0 9046 Male 67 0 1 Yes \n",
"1 51676 Female 61 0 0 Yes \n",
"2 31112 Male 80 0 1 Yes \n",
"3 60182 Female 49 0 0 Yes \n",
"4 1665 Female 79 1 0 Yes \n",
"... ... ... ... ... ... ... \n",
"5105 18234 Female 80 1 0 Yes \n",
"5106 44873 Female 81 0 0 Yes \n",
"5107 19723 Female 35 0 0 Yes \n",
"5108 37544 Male 51 0 0 Yes \n",
"5109 44679 Female 44 0 0 Yes \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"0 Private Urban 228.69 36.6 formerly smoked \n",
"1 Self-employed Rural 202.21 NaN never smoked \n",
"2 Private Rural 105.92 32.5 never smoked \n",
"3 Private Urban 171.23 34.4 smokes \n",
"4 Self-employed Rural 174.12 24.0 never smoked \n",
"... ... ... ... ... ... \n",
"5105 Private Urban 83.75 NaN never smoked \n",
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
"5108 Private Rural 166.29 25.6 formerly smoked \n",
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
"\n",
" stroke \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"... ... \n",
"5105 0 \n",
"5106 0 \n",
"5107 0 \n",
"5108 0 \n",
"5109 0 \n",
"\n",
"[5110 rows x 12 columns]\n",
"Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
" 'smoking_status', 'stroke'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"df = pd.read_csv(\".//static//csv//healthcare-dataset-stroke-data.csv\", sep=\",\")\n",
"\n",
"df[\"age\"] = df[\"age\"].astype(int)\n",
"print(df)\n",
"df[\"age\"].dtype\n",
"\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 274,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIQCAYAAADghdPEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAApkklEQVR4nO3de3SU9Z348U9CIASEACKXKCBwRBRBragN6FortqtIXa21tlgvpdVVXNH2aLWooNXVqq2rVlH2WLEV3dVWa70dpaKw1oioWMQLoCJeAlJEQgQilzy/P/rLlMgtRPjGJK/XORwzz3wnz2eY58S8mXlm8rIsywIAAGAHy2/oAQAAgOZBfAAAAEmIDwAAIAnxAQAAJCE+AACAJMQHAACQhPgAAACSEB8AAEAS4gMAAEhCfAAAAEmID4Bm4N133428vLyYNGlSQ4/SICZNmhR5eXnx4osv7vB9jR8/PvLy8nb4fgAaI/EBsA1qfond8E+XLl3i8MMPj8cffzz5PM8880ytWVq2bBl9+vSJU045Jd55553tso/nnnsuxo8fH8uXL98u3w+A5kt8ANTDFVdcEb///e/jd7/7XVx44YXx97//PY4++uh45JFHGmSec889N37/+9/HxIkTY/jw4fG///u/ceCBB0Z5efkX/t7PPfdcXH755eKjji655JJYvXp1Q48B8KVU0NADADRGRx11VAwePDh3edSoUdG1a9e4995745hjjkk+z6GHHhonnHBCREScfvrp0a9fvzj33HPjrrvuiosvvjj5PM1ZQUFBFBT43yvApnjmA2A76NChQxQVFW30S+fKlSvjpz/9afTo0SMKCwtjzz33jOuvvz6yLIuIiNWrV0f//v2jf//+tf61fNmyZdG9e/cYMmRIrF+/fpvn+frXvx4REQsWLNjiuqlTp8ahhx4abdu2jQ4dOsSxxx4bb7zxRu768ePHxwUXXBAREb179869vOvdd9/d6gyff3lazZ9nnnlmo7WnnXbaJteOHz++1ro//OEPMXjw4GjXrl2tdddff/1W54mIWLVqVZx55pmx8847R/v27eOUU06JTz75pNaa3XffPY455ph45plnYvDgwVFUVBQDBw7Mzf3AAw/EwIEDo3Xr1nHAAQfErFmzat3eOR8Am+efZgDqoaKiIpYuXRpZlsWSJUvi5ptvjk8//TROPvnk3Josy+Jb3/pWPP300zFq1KjYb7/94oknnogLLrggPvzww7jhhhuiqKgo7rrrrhg6dGiMHTs2fv3rX0dExOjRo6OioiImTZoULVq02Ob53n777YiI2HnnnTe75i9/+UscddRR0adPnxg/fnysXr06br755hg6dGi8/PLLsfvuu8fxxx8f8+bNi3vvvTduuOGG6Ny5c0RE7LLLLnWa48gjj4xTTjklIiJmzpwZN91002bXdu7cOW644Ybc5R/84Ae1ri8rK4sTTzwx9t1337jmmmuiuLg4li5dGueff36dZomIOOecc6JDhw4xfvz4mDt3bkyYMCEWLlyYO3emxltvvRXf//7348wzz4yTTz45rr/++hgxYkTcdttt8fOf/zzOPvvsiIi4+uqr48QTT4y5c+dGfr5/zwPYqgyAOrvzzjuziNjoT2FhYTZp0qRaa//0pz9lEZFdeeWVtbafcMIJWV5eXvbWW2/ltl188cVZfn5+Nn369Oz+++/PIiL7r//6r63O8/TTT2cRkf32t7/N/v73v2fl5eXZo48+mu2+++5ZXl5eNnPmzCzLsmzBggVZRGR33nln7rb77bdf1qVLl+zjjz/Obfvb3/6W5efnZ6ecckpu23XXXZdFRLZgwYI6/z2tWbMmi4jsnHPOyW2ruV9PP/30RutHjhyZ9e7du9a2iMjGjRuXu3zxxRdnEZEtWrQot63mfl133XVbnKfmcTvggAOyNWvW5LZfe+21WURkDz30UG5br169sojInnvuudy2J554IouIrKioKFu4cG
Fu++23377RfRo3blzmf68Am+afaQDq4ZZbbokpU6bElClT4u67747DDz88fvSjH8UDDzyQW/PYY49FixYt4txzz61125/+9KeRZVmtd8caP358DBgwIE499dQ4++yz47DDDtvodlvywx/+MHbZZZcoKSmJ4cOHx8qVK+Ouu+6qdV7KhhYtWhSvvPJKnHbaadGpU6fc9kGDBsWRRx4Zjz32WJ33vSlVVVUREdG6des6rV+zZk0UFhZucU1lZWXk5+dHhw4d6j3XGWecES1btsxdPuuss6KgoGCj+7v33ntHaWlp7vLBBx8cEf94OVvPnj032r693lkMoKkTHwD1cNBBB8WwYcNi2LBhMXLkyHj00Udj7733jnPOOSfWrFkTERELFy6MkpKSaNeuXa3b7rXXXrnra7Rq1Sp++9vfxoIFC6KysjLuvPPObTpv4LLLLospU6bE1KlTY/bs2VFeXr7Ry5Y2VLPvPffcc6Pr9tprr1i6dGmsXLmyzvv/vKVLl0ZERHFxcZ3WL1++PHbaaactriktLY3q6uoYM2ZMvP3227F06dKNztfYmj322KPW5Z122im6d+++0TksGwZGxD/vR48ePTa5fVvnAGiunPMBsB3k5+fH4YcfHjfeeGPMnz8/BgwYsM3f44knnoiIfzxrMH/+/Ojdu3edbztw4MAYNmzYNu9zR6n5ZX733Xev0/rFixdHr169trjmpJNOipdffjluvvnmmDhx4heccMs2d57N5rZn//8NBADYMs98AGwn69ati4iITz/9NCIievXqFeXl5VFZWVlr3Ztvvpm7vsbs2bPjiiuuiNNPPz3233//+NGPfhQVFRU7bNaafc+dO3ej6958883o3LlztG3bNiKiXu/cVPNJ4pt72deG1q5dG2+99VbuGaHNyc/Pj+uvvz6+9rWvxR577JF7ydu2mD9/fq3Ln376aSxatKjOkQTAFyM+ALaDtWvXxpNPPhmtWrXK/RJ99NFHx/r16+M3v/lNrbU33HBD5OXlxVFHHZW77WmnnRYlJSVx4403xqRJk+Kjjz7apndx2lbdu3eP/fbbL+66665aHx44Z86cePLJJ+Poo4/ObauJkG35kME//OEPseeee0b//v23uvahhx6K1atX594eeEtuvvnmmDp1akyePDmGDRsWQ4cOrfNMERETJ06MtWvX5i5PmDAh1q1bl3ssANixvOwKoB4ef/zx3DMYS5YsiXvuuSfmz58fF110UbRv3z4iIkaMGBGHH354jB07Nt59993Yd99948knn4yHHnoozjvvvOjbt29ERFx55ZXxyiuvxFNPPRXt2rWLQYMGxWWXXRaXXHJJnHDCCbVCYHu67rrr4qijjorS0tIYNWpU7q12i4uLa32+xgEHHBAREWPHjo2TTjopWrZsGSNGjMhFyYbeeeeduPbaa+OFF16I448/vtYzEzNnzoyIiClTpkTPnj2jW7duMW7cuLj11ltjyJAh8Y1vfGOL87722mtx4YUXxvjx4+PAAw+s131es2ZNHHHEEbm3x7311lvjkEMOiW9961v1+n4AbKOGfrstgMZkU2+127p162y//fbLJkyYkFVXV9daX1lZmZ1//vlZSUlJ1rJly2yPPfbIrrvuuty6l156KSsoKMj+4z/+o9bt1q1blx144IFZSUlJ9sknn2x2npq32r3//vu3OPem3mo3y7LsL3/5SzZ06NCsqKgoa9++fTZixIjs9ddf3+j2v/jFL7Jdd901y8/P3+Lb7m7urYg//+fOO+/MPvjgg6xHjx7Zeeedl1VUVGz0vWKDt9qtqqrKBg0alB1yyCHZunXrNrpfdX2r3WnTpmVnnHFG1rFjx2ynnXbKRo4cWeuthrPsH2+1O3z48E3OM3r06FrbNrV/b7ULsHl5WeYsOQC2j0mTJsX48eO3+AnoX/va1+K0006L0047LdlcAHw5OOcDAABIQnwAsN307ds3jjvuuC2uOfLII3PnuwDQvHjZFQAAkI
RnPgAAgCTEBwAAkES9P+ejuro6ysvLo127dvX69FsAAKBpyLIsKisro6SkJPLzN//8Rr3jo7y8PHr06FHfmwMAAE3
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(x=df[\"bmi\"])\n",
"plt.title('Box Plot для bmi')\n",
"plt.xlabel('')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 275,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADsK0lEQVR4nOzdeVxU9foH8M8MDDsMIOmgmSCuhEqaphezRE2UXLJNba+fbdqi95ZZWZqV2Wbeq9XNFkuvS7vmVhCWSZjljlgqgZaCCiggO8z5/UFnmuUs37PMAjzv16vX6wpnznznzBnu95nv830eA8dxHAghhBBCCCGEMDN6ewCEEEIIIYQQ0tJQIEUIIYQQQgghClEgRQghhBBCCCEKUSBFCCGEEEIIIQpRIEUIIYQQQgghClEgRQghhBBCCCEKUSBFCCGEEEIIIQpRIEUIIYQQQgghClEgRQghhBBCCCEKUSBFCCEtQGFhIQwGA1asWOHtoTjYunUrkpOTERQUBIPBgPPnz3t7SEQn3333HQwGA7777jtvD8Wj5s2bB4PB4O1hAPCtsRBCXFEgRQjxqoMHD+KGG25Aly5dEBQUhE6dOmHUqFH4z3/+47bnXL16Nd544w2Xn586dQrz5s3Dvn373PbczvjJKv+fyWRC165dcfvtt+P333/X5Tl+/PFHzJs3T/cgp7S0FDfddBOCg4OxbNkyrFy5EqGhoYLHrlixwuF1GgwGtG/fHsOHD8eWLVtcjueP+b//+z/B8z311FO2Y0pKSmw/v/POOxEWFqbPCySEEEIk+Ht7AISQtuvHH3/E8OHDcckll2DatGmwWCz4448/sHPnTixZsgQPPfSQW5539erVyM3NxaOPPurw81OnTmH+/PmIi4tDcnKyW55bzMMPP4yBAweioaEBe/bswTvvvINNmzbh4MGD6Nixo6Zz//jjj5g/fz7uvPNOREZG6jNgAD///DMqKyuxYMECjBw5kukxzz33HOLj48FxHE6fPo0VK1Zg7Nix+Oqrr3Dttdc6HBsUFITPPvsMb775JgICAhx+t2bNGgQFBaG2tla310MIIYQoQYEUIcRrXnjhBZjNZvz8888uE/wzZ854Z1BuUFVVJbpSw7vyyitxww03AADuuusu9OjRAw8//DA+/PBDzJkzxxPDVIx/j5QEZ2PGjMHll19u+/c999yDDh06YM2aNS6BVFpaGjZs2IAtW7ZgwoQJtp//+OOPKCgowPXXX4/PPvtM24sghBBCVKLUPkKI1+Tn5+PSSy8VnIi3b9/e5WerVq3CoEGDEBISgqioKAwbNgzffPON7ffr169Heno6OnbsiMDAQCQkJGDBggVoamqyHXP11Vdj06ZNOH78uC01LC4uDt999x0GDhwIoDmQ4X9nvyfpp59+QlpaGsxmM0JCQnDVVVchOzvbYYz8noa8vDxMnToVUVFRGDp0qOJrk5qaCgAoKCiQPC4rKwtXXnklQkNDERkZiQkTJuDw4cMO43nssccAAPHx8bbXVVhYKHneTz75BAMGDEBwcDBiYmJw66234uTJk7bfX3311bjjjjsAAAMHDoTBYMCdd96p+HVGRkYiODgY/v6u3+t16tQJw4YNw+rVqx1+/r///Q99+vRBUlKS4ucDgFdffRUGgwHHjx93+d2cOXMQEBCAc+fOAQCOHj2K66+/HhaLBUFBQbj44osxefJklJeXSz7HDz/8gBtvvBGXXHIJAgMD0blzZ8ycORM1NTWqxgEAy5YtQ9euXREcHIxBgwbhhx9+wNVXX42rr75a0eu3Wq2YN28eOnbsiJCQEAwfPhx5eXmIi4uTfQ/FjhEaR21tLebNm4cePXogKCgIsbGxmDRpEvLz823HVFVV4Z///Cc6d+6MwMBA9OzZE6+++io4jnM4V0ZGBoYOHYrIyEiEhYWhZ8+eePLJJx2Oqaurw7PPPotu3brZrvnjjz+Ouro6RddHzKpVq2yfiejoaEyePB
l//PGH7fczZsxAWFgYqqurXR47ZcoUWCwWh79FW7ZssX12w8PDkZ6ejkOHDukyVkKIZ1AgRQjxmi5dumD37t3Izc2VPXb+/Pm47bbbYDKZ8Nxzz2H+/Pno3LkzsrKybMesWLECYWFhmDVrFpYsWYIBAwbgmWeewRNPPGE75qmnnkJycjJiYmKwcuVKrFy5Em+88QZ69+6N5557DgBw77332n43bNgwAM0By7Bhw1BRUYFnn30WL774Is6fP4/U1FTs2rXLZbw33ngjqqur8eKLL2LatGmKrw0/2WzXrp3oMZmZmRg9ejTOnDmDefPmYdasWfjxxx+RkpJiC5QmTZqEKVOmAAAWL15se10XXXSR6HlXrFiBm266CX5+fli4cCGmTZuGzz//HEOHDrXts3rqqadw7733AmhO11u5ciXuu+8+2ddVXl6OkpISnD17FocOHcIDDzyACxcu4NZbbxU8furUqfjqq69w4cIFAEBjYyM++eQTTJ06Vfa5xNx0000wGAz4+OOPXX738ccf45prrkFUVBTq6+sxevRo7Ny5Ew899BCWLVuGe++9F7///rvsfrNPPvkE1dXVeOCBB/Cf//wHo0ePxn/+8x/cfvvtiscBAG+99RZmzJiBiy++GC+//DKuvPJKTJw4EX/++afi1z9nzhzMnz8fl19+OV555RV0794do0ePRlVVleJziWlqasK1116L+fPnY8CAAXjttdfwyCOPoLy83PZ55zgO48ePx+LFi5GWlobXX38dPXv2xGOPPYZZs2bZznXo0CFce+21qKurw3PPPYfXXnsN48ePd/gSw2q1Yvz48Xj11Vcxbtw4/Oc//8HEiROxePFi3HzzzZpfzwsvvIDbb78d3bt3x+uvv45HH30U3377LYYNG2a7F26++WZUVVVh06ZNDo+trq7GV199hRtuuAF+fn4AgJUrVyI9PR1hYWFYtGgR5s6di7y8PAwdOlT2Sw5CiA/hCCHES7755hvOz8+P8/Pz44YMGcI9/vjj3Ndff83V19c7HHf06FHOaDRy1113HdfU1OTwO6vVavvf1dXVLs9x3333cSEhIVxtba3tZ+np6VyXLl1cjv355585ANwHH3zg8hzdu3fnRo8e7fJ88fHx3KhRo2w/e/bZZzkA3JQpU5iuwbZt2zgA3Pvvv8+dPXuWO3XqFLdp0yYuLi6OMxgM3M8//8xxHMcVFBS4jC05OZlr3749V1paavvZ/v37OaPRyN1+++22n73yyiscAK6goEB2PPX19Vz79u25pKQkrqamxvbzjRs3cgC4Z555xvazDz74gANgG6MU/ljn/wIDA7kVK1a4HA+Amz59OldWVsYFBARwK1eu5DiO4zZt2sQZDAausLDQdq3Pnj1re9wdd9zBhYaGyo5nyJAh3IABAxx+tmvXLg4A99FHH3Ecx3F79+7lAHCffPKJ7PmcCd2LCxcu5AwGA3f8+HFF46irq+PatWvHDRw4kGtoaLAdt2LFCg4Ad9VVVzGPq7i4mPP39+cmTpzo8PN58+ZxALg77rjD9jP+3ty2bZvtZ126dHE4hnfVVVc5jOP999/nAHCvv/66y7H8Z+jLL7/kAHDPP/+8w+9vuOEGzmAwcMeOHeM4juMWL17s8j47W7lyJWc0GrkffvjB4edvv/02B4DLzs4Wfawz/r7iFRYWcn5+ftwLL7zgcNzBgwc5f39/28+tVivXqVMn7vrrr3c47uOPP+YAcNu3b+c4juMqKyu5yMhIbtq0aQ7HFRcXc2az2eHnzmMhhPgWWpEihHjNqFGjkJOTg/Hjx2P//v14+eWXMXr0aHTq1AkbNmywHffll1/CarXimWeegdHo+GfLvjRwcHCw7X9XVlaipKQEV155Jaqrq/Hrr7+qHue+fftw9OhRTJ06FaWlpSgpKUFJSQmqqqowYsQIbN++HVar1eEx999/v6LnuPvuu3HRRRehY8eOSE9PR1VVFT788EOH/UT2ioqKsG/fPtx555
2Ijo62/bxv374YNWoUNm/erPyFAvjll19w5swZPPjggwgKCrL9PD09Hb169XL5tl2pZcuWISMjAxkZGVi1ahWGDx+
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"\n",
"# Визуализация данных после обработки\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df[\"bmi\"], df[\"avg_glucose_level\"])\n",
"plt.xlabel(\"bmi\")\n",
"plt.ylabel(\"avg_glucose_level\")\n",
"plt.title(\"Scatter Plot of BMI vs avg_glucose_level\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Удаление строк с пустыми значениями"
]
},
{
"cell_type": "code",
"execution_count": 276,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы:\n",
" id gender age hypertension heart_disease ever_married \\\n",
"21 13861 Female 52 1 0 Yes \n",
"113 41069 Female 45 0 0 Yes \n",
"254 32257 Female 47 0 0 Yes \n",
"258 28674 Female 74 1 0 Yes \n",
"270 72911 Female 57 1 0 Yes \n",
"... ... ... ... ... ... ... \n",
"4858 1696 Female 43 0 0 Yes \n",
"4906 72696 Female 53 0 0 Yes \n",
"4952 16245 Male 51 1 0 Yes \n",
"5009 40732 Female 50 0 0 Yes \n",
"5057 38349 Female 49 0 0 Yes \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"21 Self-employed Urban 233.29 48.9 never smoked \n",
"113 Private Rural 224.10 56.6 never smoked \n",
"254 Private Urban 210.95 50.1 Unknown \n",
"258 Self-employed Urban 205.84 54.6 never smoked \n",
"270 Private Rural 129.54 60.9 smokes \n",
"... ... ... ... ... ... \n",
"4858 Private Urban 100.88 47.6 smokes \n",
"4906 Private Urban 70.51 54.1 never smoked \n",
"4952 Self-employed Rural 211.83 56.6 never smoked \n",
"5009 Self-employed Rural 126.85 49.5 formerly smoked \n",
"5057 Govt_job Urban 69.92 47.6 never smoked \n",
"\n",
" stroke \n",
"21 1 \n",
"113 1 \n",
"254 0 \n",
"258 0 \n",
"270 0 \n",
"... ... \n",
"4858 0 \n",
"4906 0 \n",
"4952 0 \n",
"5009 0 \n",
"5057 0 \n",
"\n",
"[110 rows x 12 columns]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydeXwU5f3HP7shNzmJsAGBhIBABEQQBIMoESSAgoJVDg/UYlXwgFbxooJYEW1FWqj+RAQrBW8LCqJEUCQGsdwhKBATqJCgSSCB3GTn90eYZY85nmeO3dnk+3699EV2Z2eemXlmnu/9tQmCIIAgCIIgCIIgCIJgxh7oARAEQRAEQRAEQQQbpEgRBEEQBEEQBEFwQooUQRAEQRAEQRAEJ6RIEQRBEARBEARBcEKKFEEQBEEQBEEQBCekSBEEQRAEQRAEQXBCihRBEARBEARBEAQnpEgRBEEQBEEQBEFwQooUQRAEQRAEQRAEJ6RIEQRBBAFFRUWw2WxYuXJloIfiwcaNG9G3b19ERETAZrPh9OnTgR4SYRBff/01bDYbvv7660APxa/MnTsXNpst0MMAYK2xEAThCylSBEEElP379+OWW25B586dERERgQ4dOmDEiBH4xz/+YdoxV69ejVdffdXn8xMnTmDu3LnYs2ePacf2RhRWxf9CQ0PRpUsX3Hnnnfj5558NOcZ3332HuXPnGq7klJWV4dZbb0VkZCSWLl2Kd955B9HR0ZLbrly50uM8bTYb2rZti2HDhuHzzz/32V7c5ve//73k/p5++mnXNqWlpa7Pp06ditatWxtzggRBEAShQKtAD4AgiJbLd999h2HDhqFTp06YNm0aHA4H/ve//2H79u1YvHgxHnroIVOOu3r1auTl5eHRRx/1+PzEiROYN28eUlJS0LdvX1OOLcfDDz+MAQMGoKGhAbt27cIbb7yB9evXY//+/Wjfvr2ufX/33XeYN28epk6divj4eGMGDOCHH37AmTNnMH/+fAwfPpzpN8899xxSU1MhCAJOnjyJlStXYvTo0fj0009xww03eGwbERGBjz76CP/85z8RFhbm8d2aNWsQERGB2tpaw86HIAiCIHggRYogiIDxl7/8BXFxcfjhhx98BPxff/01MIMygaqqKllPjcjVV1+NW265BQBw991345JLLsHDDz+Mt99+G08++aQ/hsmNeI94lLNRo0bhiiuucP197733ol27dlizZo2PIpWVlYV169bh888/x7hx41yff/fddygsLMSECRPw0Ucf6TsJgiAIgtAIhfYRBBEwCgoKcOmll0oK4m3btvX5bNWqVRg4cCCioqKQkJCAoUOH4ssvv3R9v3btWowZMwbt27dHeHg40tLSMH/+fDQ2Nrq2ufbaa7F+/XocPXrUFRqWkpKCr7/+GgMGDADQpMiI37nnJH3//ffIyspCXFwcoqKicM011yAnJ8djjGJOQ35+PiZPnoyEhAQMGTKE+9pkZmYCAAoLCxW327x5M66++mpER0cjPj4e48aNw8GDBz3G89hjjwEAUlNTXedVVFSkuN8PPvgA/fv3R2RkJJKSknD77bfj+PHjru+vvfZa3HXXXQCAAQMGwGazYerUqdznGR8fj8jISLRq5WvX69ChA4YOHYrVq1d7fP7vf/8bvXv3Rq9evbiPBwB//etfYbPZcPToUZ/vnnzySYSFheHUqVMAgMOHD2PChAlwOByIiIjAxRdfjIkTJ6KiokLxGN9++y1+97vfoVOnTggPD0fHjh0xc+ZM1NTUaBoHACxduhRdunRBZGQkBg4ciG+//RbXXnstrr32Wq7zdzqdmDt3Ltq3b4+oqCgMGzYM+fn5SElJUb2HcttIjaO2thZz587FJZdcgoiICCQnJ2P8+PEoKChwbVNVVYU//vGP6NixI8LDw9G9e3f89a9/hSAIHvvatGkThgwZgvj4eLRu3Rrdu3fHU0895bFNXV0dnn32WXTt2tV1zR
9//HHU1dVxXR85Vq1a5XomEhMTMXHiRPzvf/9zfT9jxgy0bt0a1dXVPr+dNGkSHA6Hx7vo888/dz27MTExGDNmDA4cOGDIWAmC8A+kSBEEETA6d+6MnTt3Ii8vT3XbefPm4Y477kBoaCiee+45zJs3Dx07dsTmzZtd26xcuRKtW7fGrFmzsHjxYvTv3x9//vOf8cQTT7i2efrpp9G3b18kJSXhnXfewTvvvINXX30VPXv2xHPPPQcAuO+++1zfDR06FECTwjJ06FBUVlbi2WefxQsvvIDTp08jMzMTO3bs8Bnv7373O1RXV+OFF17AtGnTuK+NKGy2adNGdpvs7GyMHDkSv/76K+bOnYtZs2bhu+++Q0ZGhktRGj9+PCZNmgQAWLRokeu8LrroItn9rly5ErfeeitCQkKwYMECTJs2DR9//DGGDBniyrN6+umncd999wFoCtd755138Ic//EH1vCoqKlBaWorffvsNBw4cwAMPPICzZ8/i9ttvl9x+8uTJ+PTTT3H27FkAwLlz5/DBBx9g8uTJqseS49Zbb4XNZsP777/v893777+P66+/HgkJCaivr8fIkSOxfft2PPTQQ1i6dCnuu+8+/Pzzz6r5Zh988AGqq6vxwAMP4B//+AdGjhyJf/zjH7jzzju5xwEAr732GmbMmIGLL74YL730Eq6++mrcdNNN+OWXX7jP/8knn8S8efNwxRVX4OWXX0a3bt0wcuRIVFVVce9LjsbGRtxwww2YN28e+vfvj7/97W945JFHUFFR4XreBUHA2LFjsWjRImRlZeGVV15B9+7d8dhjj2HWrFmufR04cAA33HAD6urq8Nxzz+Fvf/sbxo4d62HEcDqdGDt2LP7617/ixhtvxD/+8Q/cdNNNWLRoEW677Tbd5/OXv/wFd955J7p164ZXXnkFjz76KL766isMHTrUNRduu+02VFVVYf369R6/ra6uxqeffopbbrkFISEhAIB33nkHY8aMQevWrbFw4ULMmTMH+fn5GDJkiKqRgyAICyEQBEEEiC+//FIICQkRQkJChMGDBwuPP/648MUXXwj19fUe2x0+fFiw2+3CzTffLDQ2Nnp853Q6Xf+urq72OcYf/vAHISoqSqitrXV9NmbMGKFz584+2/7www8CAGHFihU+x+jWrZswcuRIn+OlpqYKI0aMcH327LPPCgCESZMmMV2DLVu2CACEt956S/jtt9+EEydOCOvXrxdSUlIEm80m/PDDD4IgCEJhYaHP2Pr27Su0bdtWKCsrc322d+9ewW63C3feeafrs5dfflkAIBQWFqqOp76+Xmjbtq3Qq1cvoaamxvX5Z599JgAQ/vznP7s+W7FihQDANUYlxG29/wsPDxdWrlzpsz0AYfr06UJ5ebkQFhYmvPPOO4IgCML69esFm80mFBUVua71b7/95vrdXXfdJURHR6uOZ/DgwUL//v09PtuxY4cAQPjXv/4lCIIg7N69WwAgfPDBB6r780ZqLi5YsECw2WzC0aNHucZRV1cntGnTRhgwYIDQ0NDg2m7lypUCAOGaa65hHldJSYnQqlUr4aabbvL4fO7cuQIA4a677nJ9Js7NLVu2uD7r3LmzxzYi11xzjcc43nrrLQGA8Morr/hsKz5D//nPfwQAwvPPP+/x/S233CLYbDbhyJEjgiAIwqJFi3zuszfvvPOOYLfbhW+//dbj89dff10AIOTk5Mj+1htxXokUFRUJISEhwl/+8heP7fbv3y+0atXK9bnT6RQ6dOggTJgwwWO7999/XwAgbN26VRAEQThz5owQHx8vTJs2zWO7kpISIS4uzuNz77EQBGEtyCNFEETAGDFiBHJzczF27Fjs3bsXL730EkaOHIkOHTpg3bp1ru3+85//wOl04s9//jPsds/Xlntp4MjISNe/z5w5g9LSUlx99dWorq7Gjz/+qHmce/bsweHDhzF58mSUlZWhtLQUpaWlqKqqwnXXXYetW7fC6XR6/Ob+++/nOs
Y999yDiy66CO3bt8eYMWNQVVWFt99+2yOfyJ3i4mLs2bMHU6dORWJiouvzPn36YMSIEdiwYQP/iQL473//i19//RU
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df_cleaned = df.dropna()\n",
"df_cleaned = df_cleaned.loc[df_cleaned[\"bmi\"] != \"N/A\"]\n",
"# уберем шумы\n",
"\n",
"# Статистический анализ для определения выбросов\n",
"Q1 = df[\"bmi\"].quantile(0.25)\n",
"Q3 = df[\"bmi\"].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"\n",
"# Определение порога для выбросов\n",
"threshold = 1.5 * IQR\n",
"outliers = (df[\"bmi\"] < (Q1 - threshold)) | (df[\"bmi\"] > (Q3 + threshold))\n",
"\n",
"# Вывод выбросов\n",
"print(\"Выбросы:\")\n",
"print(df[outliers])\n",
"\n",
"# Обработка выбросов\n",
"# В данном случае мы занулим выбросы на медиану\n",
"median = df[\"bmi\"].median()\n",
"df.loc[outliers, \"bmi\"] = median\n",
"\n",
"\n",
"# Визуализация данных после обработки\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df[\"bmi\"], df[\"avg_glucose_level\"])\n",
"plt.xlabel(\"bmi\")\n",
"plt.ylabel(\"avg_glucose_level\")\n",
"plt.title(\"Scatter Plot of BMI vs avg_glucose_level\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Применение методов приращения данных (аугментации)"
]
},
{
"cell_type": "code",
"execution_count": 277,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 2945\n",
"Размер контрольной выборки: 982\n",
"Размер тестовой выборки: 982\n",
"Распределение в обучающей выборке:\n",
"age\n",
"37 57\n",
"52 55\n",
"56 54\n",
"57 54\n",
"53 52\n",
" ..\n",
"72 20\n",
"68 20\n",
"7 17\n",
"4 14\n",
"6 13\n",
"Name: count, Length: 83, dtype: int64\n",
"\n",
"Распределение в контрольной выборке:\n",
"age\n",
"78 22\n",
"51 21\n",
"41 21\n",
"18 18\n",
"63 18\n",
" ..\n",
"9 6\n",
"10 5\n",
"12 5\n",
"74 4\n",
"67 2\n",
"Name: count, Length: 83, dtype: int64\n",
"\n",
"Распределение в тестовой выборке:\n",
"age\n",
"78 25\n",
"44 23\n",
"54 23\n",
"50 21\n",
"57 21\n",
" ..\n",
"11 6\n",
"76 5\n",
"7 5\n",
"77 4\n",
"6 4\n",
"Name: count, Length: 83, dtype: int64\n",
"\n",
"Распределение в обучающей выборке после oversampling:\n",
"age\n",
"32 57\n",
"81 57\n",
"42 57\n",
"31 57\n",
"23 57\n",
" ..\n",
"10 57\n",
"74 57\n",
"76 57\n",
"4 57\n",
"29 57\n",
"Name: count, Length: 83, dtype: int64\n",
"\n",
"Распределение в контрольной выборке после oversampling:\n",
"age\n",
"28 22\n",
"74 22\n",
"30 22\n",
"14 22\n",
"71 22\n",
" ..\n",
"80 22\n",
"18 22\n",
"82 22\n",
"65 22\n",
"67 22\n",
"Name: count, Length: 83, dtype: int64\n",
"\n",
"Распределение в тестовой выборке после oversampling:\n",
"age\n",
"80 25\n",
"42 25\n",
"66 25\n",
"29 25\n",
"47 25\n",
" ..\n",
"7 25\n",
"72 25\n",
"76 25\n",
"34 25\n",
"13 25\n",
"Name: count, Length: 83, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Разделение на обучающую и тестовую выборки\n",
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"def check_balance(df, name):\n",
" counts = df[\"age\"].value_counts()\n",
" print(f\"Распределение в {name}:\")\n",
" print(counts)\n",
" print()\n",
"\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")\n",
"\n",
"def oversample(df):\n",
" X = df.drop(\"age\", axis=1)\n",
" y = df[\"age\"]\n",
"\n",
" oversampler = RandomOverSampler(random_state=42)\n",
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
"\n",
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
]
},
{
"cell_type": "code",
"execution_count": 278,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение в обучающей выборке после oversampling:\n",
"age\n",
"32 57\n",
"81 57\n",
"42 57\n",
"31 57\n",
"23 57\n",
" ..\n",
"10 57\n",
"74 57\n",
"76 57\n",
"4 57\n",
"29 57\n",
"Name: count, Length: 83, dtype: int64\n",
"\n",
"Распределение в контрольной выборке после oversampling:\n",
"age\n",
"28 22\n",
"74 22\n",
"30 22\n",
"14 22\n",
"71 22\n",
" ..\n",
"80 22\n",
"18 22\n",
"82 22\n",
"65 22\n",
"67 22\n",
"Name: count, Length: 83, dtype: int64\n",
"\n",
"Распределение в тестовой выборке после oversampling:\n",
"age\n",
"80 25\n",
"42 25\n",
"66 25\n",
"29 25\n",
"47 25\n",
" ..\n",
"7 25\n",
"72 25\n",
"76 25\n",
"34 25\n",
"13 25\n",
"Name: count, Length: 83, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"\n",
"def oversample(df):\n",
" X = df.drop(\"age\", axis=1)\n",
" y = df[\"age\"]\n",
"\n",
" oversampler = RandomOverSampler(random_state=42)\n",
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
"\n",
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aisenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}