1252 lines
437 KiB
Plaintext
1252 lines
437 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Customer Personality Analysis\n",
|
|||
|
"https://www.kaggle.com/datasets/imakash3011/customer-personality-analysis Набор представляет собой данные о покупателях\n",
|
|||
|
"Пример цели: Узнать, кто больше всего покупает продукцию (вино)\n",
|
|||
|
"Входные данные: год рождения, степень образования, статус отношений, сколько детей, сколько подростков, сколько было потрачено на вино"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 261,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['work_year', 'experience_level', 'employment_type', 'job_title',\n",
|
|||
|
" 'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',\n",
|
|||
|
" 'remote_ratio', 'company_location', 'company_size'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//ds_salaries.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 262,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA7kklEQVR4nO3dd5hV1b0//s8MMAMIMwOIFKWIYgFB7EGsERsYSxRLiIKxxKhRE0tiQEFjvhpbEonGGK+i6JWrXlssESIiSQRUFBGwBiyRJjoUpTP794e/OZdDGQbUNYO8Xs8zD5y911nrs/dZz5nznrNLQZZlWQAAAHzDCmu6AAAAYPMgfAAAAEkIHwAAQBLCBwAAkITwAQAAJCF8AAAASQgfAABAEsIHAACQhPABAAAkIXwAfEXvv/9+FBQUxNChQ2u6lE3e0KFDo6CgIN5///2aLmWtBg8eHAUFBTVdxlfSvn376N+/f02XAWymhA+g1qj84Lnqz1ZbbRUHH3xwPPPMM8nrGT16dF4t9erViw4dOsRpp50W06ZN+1rGePHFF2Pw4MExb968r6U/AKjNhA+g1rn66qtj2LBhce+998Zll10Wn3zySfTq1SuefPLJGqnnggsuiGHDhsUdd9wRvXv3jv/5n/+JvfbaK2bMmPGV+37xxRfjqquuEj42EQMHDozFixfXdBkAm6y6NV0AwOqOPPLI2HPPPXOPzzjjjGjRokU88MADcdRRRyWvZ//9948TTjghIiJOP/302GGHHeKCCy6Ie+65Jy6//PLk9VB9ixYtioYNG35t/dWtWzfq1vWrE2Bj+eYDqPXKysqiQYMGa3zo++KLL+Liiy+ONm3aRHFxcey4445x4403RpZlERGxePHi2GmnnWKnnXbK+2v1Z599Fq1atYp99903Vq5cucH1fPe7342IiOnTp1fZbtSoUbH//vvHFltsEWVlZXHMMcfEm2++mVs/ePDguPTSSyMiYtttt80d3lWd8x1WPzyt8mf06NFrtO3fv/9a2w4ePDiv3cMPPxx77rlnNG7cOK/djTfeWGUty5cvj6uuuio6duwY9evXj2bNmsV+++0XI0eOzLWZNGlS9O/fPzp06BD169ePli1bxo9+9KP49NNP17utjz/+ePTu3Ttat24dxcXFsd1228Wvf/3rNV67gw46KHbZZZeYMGFCHHDAAdGwYcP41a9+Ff369Ystt9wyli9fvkbfhx12WOy4447rraHS2s75KCgoiPPPPz8ee+yx2GWXXaK4uDg6d+4cf/vb36rdb8T/Hea3+mu4tnOKZs2aFaeffnpss802UVxcHK1atYpjjjkmb+5kWRbXXHNNbLPNNtGwYcM4+OCDY8qUKRtUE8DXzZ9vgFpn/vz5MXfu3MiyLObMmRNDhgyJzz//PH74wx/m2mRZFkcffXQ8//zzccYZZ0S3bt3i2WefjUsvvTQ+/vjj+N3vfhcNGjSIe+65J3r06BEDBgyIm2++OSIizjvvvJg/f34MHTo06tSps8H1/fvf/46IiGbNmq2zzd///vc48sgjo0OHDjF48OBYvHhxDBkyJHr06BGvvvpqtG/fPr7//e/HO++8Ew888ED87ne/iy233DIiIpo3b16tOg499NA47bTTIiLi5ZdfjltuuWWdbbfccsv43e9+l3t86qmn5q0fO3ZsnHjiibHrrrvGddddF6WlpTF37tz42c9+tt46Bg8eHNdee22ceeaZsffee8eCBQvilVdeiVdffTUOPfTQiIgYOXJkTJs2LU4//fRo2bJlTJkyJe64446YMmVKjBs3rsqTuIcOHRqNGjWKn//859GoUaMYNWpUXHnllbFgwYK44YYb8tp++umnceSRR8bJJ58cP/zhD6NFixaxxRZbxL333hvPPvts3jdns2bNilGjRsWgQYPWu43r889//jMeeeSROPfcc6Nx48Zxyy23xPHHHx8ffvhhlfNkYx1//PExZcqU+OlPfx
rt27ePOXPmxMiRI+PDDz+M9u3bR0TElVdeGddcc0306tUrevXqFa+++mocdthhsWzZsq+9HoBqywBqibvvvjuLiDV+iouLs6FDh+a1feyxx7KIyK655pq85SeccEJWUFCQvffee7lll19+eVZYWJiNGTMme+ihh7KIyH7/+9+vt57nn38+i4jsrrvuyj755JNsxowZ2VNPPZW1b98+KygoyF5++eUsy7Js+vTpWURkd999d+653bp1y7baaqvs008/zS17/fXXs8LCwuy0007LLbvhhhuyiMimT59e7f20bNmyLCKy888/P7escruef/75Ndr37ds323bbbfOWRUQ2aNCg3OPLL788i4hs5syZuWWV23XDDTdUWc+uu+6a9e7du8o2ixYtWmPZAw88kEVENmbMmNyyyjmw6v5Y23N//OMfZw0bNsyWLFmSW3bggQdmEZHdfvvteW1XrlyZbbPNNtlJJ52Ut/zmm2/OCgoKsmnTplVZ+6oGDRqUrf6rMyKyoqKivDn3+uuvZxGRDRkypNp9V8631V/D1edXeXn5el+XOXPmZEVFRVnv3r2zioqK3PJf/epXWURk/fr1q3ZdAF8nh10Btc6tt94aI0eOjJEjR8Z9990XBx98cJx55pnxyCOP5No8/fTTUadOnbjgggvynnvxxRdHlmV5V8caPHhwdO7cOfr16xfnnntuHHjggWs8ryo/+tGPonnz5tG6devo3bt3fPHFF3HPPffknZeyqpkzZ8bEiROjf//+0bRp09zyrl27xqGHHhpPP/10tcdemyVLlkRERP369avVftmyZVFcXFxlm4ULF0ZhYWGUlZVtcD1lZWUxZcqUePfdd9fZpkGDBrn/L1myJObOnRvf+c53IiLi1VdfrbL/VZ+7cOHCmDt3buy///6xaNGieOutt/LaFhcXx+mnn563rLCwMPr27RtPPPFELFy4MLf8/vvvj3333Te23Xbb9W/kevTs2TO222673OOuXbtGSUnJ13ZVtFU1aNAgioqKYvTo0VFeXr7WNn//+99j2bJl8dOf/jTvW6WLLrroa68HYEMIH0Cts/fee0fPnj2jZ8+e0bdv33jqqaeiU6dOcf755+cOGfnggw+idevW0bhx47zn7rzzzrn1lYqKiuKuu+6K6dOnx8KFC+Puu+/eoHs1XHnllTFy5MgYNWpUTJo0KWbMmLHGYUurqhx7becS7LzzzjF37tz44osvqj3+6ubOnRsREaWlpdVqP2/evGjUqFGVbbp37x4VFRVx4YUXxr///e+YO3fuOj/Yru7qq6+OefPmxQ477BBdunSJSy+9NCZNmpTX5rPPPosLL7wwWrRoEQ0aNIjmzZvnPvTPnz+/yv6nTJkSxx13XJSWlkZJSUk0b948dwje6s/deuuto6ioaI0+TjvttFi8eHE8+uijERHx9ttvx4QJE6p8HTdE27Zt11jWpEmTau/DDVFcXBy//e1v45lnnokWLVrEAQccENdff33MmjUr16ZyDnbs2DHvuc2bN48mTZp87TUBVJfwAdR6hYWFcfDBB8fMmTOr/Ot6VZ599tmI+PKv7hvaR5cuXaJnz55x8MEHR5cuXWr8akeVJxVXHtu/PrNmzYqWLVtW2ebkk0+Oiy++OIYOHRrbb799NG/ePHbfffdq9X/AAQfEv//977jrrrtil112iTvvvDN23333uPPOO3NtTjzxxPjLX/4S55xzTjzyyCMxYsSI3AnZFRUV6+x73rx5ceCBB8brr78eV199dfz1r3+NkSNHxm9/+9u1PnfVb0lW1alTp9hjjz3ivvvui4iI++67L4qKiuLEE0+s1jauz7rOHcr+/4sfVMe6AvHaLopw0UUXxTvvvBPXXntt1K9fP6644orYeeed47XXXqv2eAA1QfgANgkrVqyIiIjPP/88IiLatWsXM2bMyDuMJiJyh+G0a9cut2zSpElx9dVXx+mnnx677bZbnHnmmev9a/tXUTn222+/vca6t9
56K7bccsvYYostImLdHzir8sorr0RErPOwr1UtX7483nvvvdw3QutSWFgYN954Yxx00EHRsWPH3CFv1dW0adM4/fT
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Визуализация данных - ящик с усами. Как видим, выборка относительно сбалансирована: есть умеренное смещение к середине, медиана уравновешена\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df[\"salary_in_usd\"])\n",
|
|||
|
"plt.title(\"Box Plot для salary_in_usd\")\n",
|
|||
|
"plt.xlabel(\"salary_in_usd\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 263,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0sAAAIjCAYAAADSlID1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABwG0lEQVR4nO3deXhU1f3H8c9MkpnsGxASICRBNsMii6AoixUsigtuFa2iWEW0WsTWDZeibRXFautScavi0l8FFyhWRFBxR3ABWYLIFjYhAbLvycz5/ZFmzJCbZDJMmADv1/PMo7lz7jnfe+bO8uHO3GszxhgBAAAAALzYg10AAAAAALRFhCUAAAAAsEBYAgAAAAALhCUAAAAAsEBYAgAAAAALhCUAAAAAsEBYAgAAAAALhCUAAAAAsEBYAgAAAAALhCUAaMR9990nm80W7DK8nHbaaTrttNOCXUaLfPzxx7LZbPr444+DXcoRzWaz6b777gt2GQGVnp6uSZMmBbsMSW2rFgBtB2EJAAAAACyEBrsAAIDvlixZEuwSECTl5eUKDeVtGwAOJ44sAcBhYIxReXn5IffjcDjkcDgCUBGOBG63WxUVFZKk8PBwwhIAHGaEJQBHleLiYk2bNk3p6elyOp1KSkrSGWecoe+++87T5rPPPtOvfvUrde3aVU6nU6mpqbrlllt8CjMvvfSSTj/9dCUlJcnpdCozM1OzZ89u0C49PV3nnHOO3n//fZ144omKiIjQs88+q1GjRumEE06w7LtXr14aO3Zsk+Mf/Julut8DzZs3Tw888IC6dOmi8PBwjR49Wps3b252eyZNmqT09PQGy61+r7V06VINHz5c8fHxio6OVq9evXTXXXd5tdm1a5fOP/98RUVFKSkpSbfccosqKyubrePNN9+UzWbTJ5980uC+Z599VjabTevWrZMk7d27V1dffbW6dOkip9OplJQUjR8/XtnZ2c2O88MPP+jiiy9WYmKiwsPDdeKJJ2rhwoWe+3Nzc9WhQweddtppMsZ4lm/evFlRUVGaMGGCZ9lpp52mvn376ttvv9Upp5yiiIgIZWRk6JlnnmkwbmVlpWbMmKHu3bt79rnbb7+9wdzYbDbddNNN+te//qU+ffrI6XRq8eLFnvsO/s3S7t279Zvf/EYdO3aU0+lUnz599OKLL3q1aek+smLFCo0bN04JCQmKiopS//799fjjj7doHg9FQUGBpk2bptTUVDmdTnXv3l0PP/yw3G63JKm6ulqJiYm6+uqrG6xbVFSk8PBw3XrrrZ5lvs49AFjhn6gAHFWuv/56vfnmm7rpppuUmZmpAwcO6PPPP9eGDRs0aNAgSdIbb7yhsrIy3XDDDWrXrp1WrlypJ598Urt27dIbb7zRZP+zZ89Wnz59dN555yk0NFTvvPOOfvvb38rtduvGG2/0artx40ZddtllmjJliiZPnqxevXopOjpakydP1rp169S3b19P26+//lo//vij7rnnHr+2+6GHHpLdbtett96qwsJCzZo1S5dffrlWrFjhV38HW79+vc455xz1799ff/rTn+R0OrV582Z98cUXnjbl5eUaPXq0duzYoalTp6pTp0569dVX9dFHHzXb/9lnn63o6GjNmzdPo0aN8rpv7ty56tOnj2e+LrroIq1fv16/+93vlJ6ertzcXC1dulQ7duywDH71t+HUU09V586ddeeddyoqKkrz5s3T+eefr7feeksXXHCBkpKSNHv2bP3qV7/Sk08+qalTp8rtdmvSpEmKiYnR008/7dVnfn6+xo0bp0suuUSXXXaZ5s2bpxtuuEEOh0O/+c1vJNUeHTrvvPP0+eef67rrrtPxxx+vtWvX6m9/+5t+/PFHLViwwKvPjz76SPPmzdNNN92k9u3bN7pNOTk5Ovnkkz0Bq0OHDnrvvfd0zTXXqKioSNOmTfNq78s+snTpUp1zzjlKSUnRzTffrOTkZG3YsEH//e9/dfPNN/s8j/
4qKyvTqFGjtHv3bk2ZMkVdu3bVl19+qenTp2vPnj36+9//rrCwMF1wwQV6++239eyzz3odaV2wYIEqKyt16aWX+jX3ANCAAYCjSFxcnLnxxhubbFNWVtZg2cyZM43NZjPbt2/3LJsxY4Y5+GXSat2xY8eabt26eS1LS0szkszixYu9lhcUFJjw8HBzxx13eC2fOnWqiYqKMiUlJU3WPmrUKDNq1CjP38uWLTOSzPHHH28qKys9yx9//HEjyaxdu7bJ/q666iqTlpbWYPnB2/63v/3NSDL79u1rtK+///3vRpKZN2+eZ1lpaanp3r27kWSWLVvWZC2XXXaZSUpKMjU1NZ5le/bsMXa73fzpT38yxhiTn59vJJlHHnmkyb6sjB492vTr189UVFR4lrndbnPKKaeYHj16NKglMjLS/Pjjj+aRRx4xksyCBQu82owaNcpIMo8++qhnWWVlpRkwYIBJSkoyVVVVxhhjXn31VWO3281nn33mtf4zzzxjJJkvvvjCs0ySsdvtZv369Q3ql2RmzJjh+fuaa64xKSkpZv/+/V7tLr30UhMXF+fZV33dR2pqakxGRoZJS0sz+fn5Xn263W6/5rE5aWlp5qqrrvL8/ec//9lERUWZH3/80avdnXfeaUJCQsyOHTuMMca8//77RpJ55513vNqNGzfO67nYkrk/uBYAMMYYvoYH4KgSHx+vFStW6Keffmq0TUREhOf/S0tLtX//fp1yyikyxmjVqlVN9l9/3cLCQu3fv1+jRo3S1q1bVVhY6NU2IyOjwdfq4uLiNH78eP373//2fM3L5XJp7ty5nq+v+ePqq6/2+hf2ESNGSJK2bt3qV38Hi4+PlyT95z//8Xwd6mCLFi1SSkqKLr74Ys+yyMhIXXfddT6NMWHCBOXm5nqdYvzNN9+U2+32fP0tIiJCDodDH3/8sfLz832uPy8vTx999JEuueQSFRcXa//+/dq/f78OHDigsWPHatOmTdq9e7en/VNPPaW4uDhdfPHFuvfeezVx4kSNHz++Qb+hoaGaMmWK52+Hw6EpU6YoNzdX3377raTaI5nHH3+8evfu7Rl3//79Ov300yVJy5Yt8+pz1KhRyszMbHJ7jDF66623dO6558oY49Xv2LFjVVhY6PXVU6n5fWTVqlXatm2bpk2b5nm869R9JbOl89hSb7zxhkaMGKGEhASvbRozZoxcLpc+/fRTSdLpp5+u9u3ba+7cuZ518/PztXTpUq+vSrZ07gHgYIQlAEeVWbNmad26dUpNTdXQoUN13333NQgMO3bs0KRJk5SYmKjo6Gh16NDB89WvgwPPwb744guNGTNGUVFRio+PV4cOHTy/27EKS1auvPJK7dixQ5999pkk6YMPPlBOTo4mTpzo1zZLUteuXb3+TkhIkKQWBYqmTJgwQaeeeqquvfZadezYUZdeeqnmzZvnFZy2b9+u7t27N/itU69evXwa48wzz1RcXJzXB+C5c+dqwIAB6tmzpyTJ6XTq4Ycf1nvvvaeOHTtq5MiRmjVrlvbu3dtk35s3b5YxRvfee686dOjgdZsxY4ak2t8r1UlMTNQTTzyhNWvWKC4uTk888YRlv506dWoQcOtqrfsN1aZNm7R+/foG49a1qz+u1Ph+U9++fftUUFCg5557rkG/db/lObjf5vaRLVu2SJLX10MP1tJ5bKlNmzZp8eLFDfoeM2aMV9+hoaG66KKL9J///Mfz26O3335b1dXVXmGppXMPAAfjN0sAjiqXXHKJRowYofnz52vJkiV65JFH9PDDD+vtt9/WWWedJZfLpTPOOEN5eXm644471Lt3b0VFRWn37t2aNGlSo0dNpNoPk6NHj1bv3r312GOPKTU1VQ6HQ4sWLdLf/va3BuvWPwpV39ixY9WxY0e99tprGjlypF577TUlJyd7PhD6IyQkxHK5qXeSAiuNXXTX5XJ5/R0REaFPP/1Uy5Yt07vvvqvFixdr7ty5Ov3007VkyZJGx28Jp9
Op888/X/Pnz9fTTz+tnJwcffHFF3rwwQe92k2bNk3nnnuuFixYoPfff1/33nuvZs6cqY8++kgDBw607Lvusbn11ls
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Визуализируем отношение уровня опыта и зарплаты\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"salary_in_usd\"], df[\"experience_level\"])\n",
|
|||
|
"plt.xlabel(\"salary_in_usd\")\n",
|
|||
|
"plt.ylabel(\"experience_level\")\n",
|
|||
|
"plt.title(\"salary in usd vs experience_level\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 264,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" work_year experience_level employment_type \\\n",
|
|||
|
"33 2023 SE FT \n",
|
|||
|
"68 2023 SE FT \n",
|
|||
|
"83 2022 EN FT \n",
|
|||
|
"133 2023 SE FT \n",
|
|||
|
"145 2023 SE FT \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"3522 2020 MI FT \n",
|
|||
|
"3675 2021 EX CT \n",
|
|||
|
"3697 2020 EX FT \n",
|
|||
|
"3747 2021 MI FT \n",
|
|||
|
"3750 2020 SE FT \n",
|
|||
|
"\n",
|
|||
|
" job_title salary salary_currency \\\n",
|
|||
|
"33 Computer Vision Engineer 342810 USD \n",
|
|||
|
"68 Applied Scientist 309400 USD \n",
|
|||
|
"83 AI Developer 300000 USD \n",
|
|||
|
"133 Machine Learning Engineer 342300 USD \n",
|
|||
|
"145 Machine Learning Engineer 318300 USD \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"3522 Research Scientist 450000 USD \n",
|
|||
|
"3675 Principal Data Scientist 416000 USD \n",
|
|||
|
"3697 Director of Data Science 325000 USD \n",
|
|||
|
"3747 Applied Machine Learning Scientist 423000 USD \n",
|
|||
|
"3750 Data Scientist 412000 USD \n",
|
|||
|
"\n",
|
|||
|
" salary_in_usd employee_residence remote_ratio company_location \\\n",
|
|||
|
"33 342810 US 0 US \n",
|
|||
|
"68 309400 US 0 US \n",
|
|||
|
"83 300000 IN 50 IN \n",
|
|||
|
"133 342300 US 0 US \n",
|
|||
|
"145 318300 US 100 US \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"3522 450000 US 0 US \n",
|
|||
|
"3675 416000 US 100 US \n",
|
|||
|
"3697 325000 US 100 US \n",
|
|||
|
"3747 423000 US 50 US \n",
|
|||
|
"3750 412000 US 100 US \n",
|
|||
|
"\n",
|
|||
|
" company_size \n",
|
|||
|
"33 M \n",
|
|||
|
"68 L \n",
|
|||
|
"83 L \n",
|
|||
|
"133 L \n",
|
|||
|
"145 M \n",
|
|||
|
"... ... \n",
|
|||
|
"3522 M \n",
|
|||
|
"3675 S \n",
|
|||
|
"3697 L \n",
|
|||
|
"3747 L \n",
|
|||
|
"3750 L \n",
|
|||
|
"\n",
|
|||
|
"[63 rows x 11 columns]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1MAAAIjCAYAAADm7UHpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB+ZElEQVR4nO3deXwU9f3H8ffuJrub+w4JEJJwGw4RBAE5rIAg3iceKFRFa7UerXe1YlvFo5faitevYrWtIFWLVVFUvAABBUQOOQPhTMhB7mST3fn9EbNmc+5OFhPg9Xw88oDMfuc7n+8xk/ns7M5YDMMwBAAAAAAIiLWjAwAAAACAoxHJFAAAAACYQDIFAAAAACaQTAEAAACACSRTAAAAAGACyRQAAAAAmEAyBQAAAAAmkEwBAAAAgAkkUwAAAABgAskUALRg9uzZslgsHR2Gj9NOO02nnXZaR4cRkE8++UQWi0WffPJJR4dyVLNYLJo9e3ZHhxFUGRkZmjlzZkeHIalzxQLg6EEyBQAAAAAmhHR0AAAA/33wwQcdHQI6SGVlpUJC+LMNAJ0JV6YA4EdgGIYqKyvbXY/dbpfdbg9CRDgaeDweVVVVSZKcTifJFAB0MiRTAI4ppaWluu2225SRkSGHw6Hk5GRNmjRJa9as8Zb5/PPPdckll6hHjx5yOBxKS0vT7bff7ley89JLL+n0009XcnKyHA6HsrKyNHfu3CblMjIydPbZZ+v999/XySefrLCwMD333HMaP368TjzxxGbr7tevnyZPntzq9ht/Z6r++0gLFizQww8/rO7du8vpdGrChAnavn17m+2ZOXOmMjIymixv7vtiS5Ys0ZgxYxQbG6vIyEj169dP9913n0+ZvXv36vzzz1dERISSk5N1++23q7q6us04Fi5cKIvFok8//bTJa88995wsFos2bNggSTp48KB++tOfqnv37nI4HEpNTdV5552nXbt2tbmd7777ThdffLHi4+PldDp18skna9GiRd7X8/LylJSUpNNOO02GYXiXb9++XREREZo2bZp32WmnnaaBAwfq66+/1ujRoxUWFqbMzEw9++yzTbZbXV2tBx98UL179/bOubvuuqtJ31gsFt1888365z//qQEDBsjhcGjx4sXe1xp/Z2rfvn265ppr1KVLFzkcDg0YMEB///vffcoEOkdWrlypqVOnKi4uThERERo8eLCefPLJgPqxPQ4fPqzbbrtNaWlpcjgc6t27tx577DF5PB5JUk1NjeLj4/XTn/60ybolJSVyOp264447vMv87XsAMIO3uAAcU372s59p4cKFuvnmm5WVlaWCggJ98cUX2rx5s4YOHSpJev3111VRUaEbb7xRCQkJWrVqlZ5++mnt3btXr7/+eqv1z507VwMGDNC5556rkJAQvf322/r5z38uj8ejm266yafsli1bdPnll+uGG27QrFmz1K9fP0VGRmrWrFnasGGDBg4c6C27evVqbd26Vffff7+pdj/66KOyWq264447VFxcrMcff1xXXnmlVq5caaq+xjZu3Kizzz5bgwcP1m9/+1s5HA5t375dy5Yt85aprKzUhAkTlJOTo1tuuUVdu3bVK6+8oo8//rjN+s866yxFRkZqwYIFGj9+vM9r8+fP14ABA7z9ddFFF2njxo36xS9+oYyMDOXl5WnJkiXKyclpNjFs2IZTTz1V3bp10z333KOIiAgtWLBA559/vv7zn//oggsuUHJysubOnatLLrlETz/9tG655RZ5PB7NnDlTUVFReuaZZ3zqLCoq0tSpU3XppZfq8ssv14IFC3TjjTfKbrfrmmuukVR3dencc8/VF198oeuvv14nnHCCvv32W/35z3/W1q1b9dZbb/nU+fHHH2vBggW6+eablZiY2GKbcnNzNXLkSG8ClpSUpPfee0/XXnutSkpKdNttt/mU92eOLFmyRGeffbZSU1N16623KiUlRZs3b9b//vc/3XrrrX73o1
kVFRUaP3689u3bpxtuuEE9evTQ8uXLde+99+rAgQP6y1/+otDQUF1wwQV644039Nxzz/lcqX3rrbdUXV2tyy67zFTfA0DADAA4hsTExBg33XRTq2UqKiqaLJszZ45hsViM3bt3e5c9+OCDRuPDZHPrTp482ejZs6fPsvT0dEOSsXjxYp/lhw8fNpxOp3H33Xf7LL/llluMiIgIo6ysrNXYx48fb4wfP977+9KlSw1JxgknnGBUV1d7lz/55JOGJOPbb79ttb4ZM2YY6enpTZY3bvuf//xnQ5Jx6NChFuv6y1/+YkgyFixY4F1WXl5u9O7d25BkLF26tNVYLr/8ciM5Odmora31Ljtw4IBhtVqN3/72t4ZhGEZRUZEhyXjiiSdaras5EyZMMAYNGmRUVVV5l3k8HmP06NFGnz59msQSHh5ubN261XjiiScMScZbb73lU2b8+PGGJOOPf/yjd1l1dbUxZMgQIzk52XC5XIZhGMYrr7xiWK1W4/PPP/dZ/9lnnzUkGcuWLfMuk2RYrVZj48aNTeKXZDz44IPe36+99lojNTXVyM/P9yl32WWXGTExMd656u8cqa2tNTIzM4309HSjqKjIp06Px2OqH9uSnp5uzJgxw/v77373OyMiIsLYunWrT7l77rnHsNlsRk5OjmEYhvH+++8bkoy3337bp9zUqVN99sVA+r5xLADgDz7mB+CYEhsbq5UrV2r//v0tlgkLC/P+v7y8XPn5+Ro9erQMw9DatWtbrb/husXFxcrPz9f48eO1c+dOFRcX+5TNzMxs8rG9mJgYnXfeefr3v//t/RiZ2+3W/PnzvR+PM+OnP/2pzzv0Y8eOlSTt3LnTVH2NxcbGSpL++9//ej9u1di7776r1NRUXXzxxd5l4eHhuv766/3axrRp05SXl+dzC/WFCxfK4/F4P14XFhYmu92uTz75REVFRX7HX1hYqI8//liXXnqpSktLlZ+fr/z8fBUUFGjy5Mnatm2b9u3b5y3/17/+VTExMbr44ov1wAMP6KqrrtJ5553XpN6QkBDdcMMN3t/tdrtuuOEG5eXl6euvv5ZUdyX0hBNOUP/+/b3bzc/P1+mnny5JWrp0qU+d48ePV1ZWVqvtMQxD//nPf3TOOefIMAyfeidPnqzi4mKfj7ZKbc+RtWvXKjs7W7fddpt3vOvVf+Qz0H4M1Ouvv66xY8cqLi7Op00TJ06U2+3WZ599Jkk6/fTTlZiYqPnz53vXLSoq0pIlS3w+ihlo3wNAoEimABxTHn/8cW3YsEFpaWkaMWKEZs+e3SShyMnJ0cyZMxUfH6/IyEglJSV5P1rWOCFqbNmyZZo4caIiIiIUGxurpKQk7/eGmkummnP11VcrJydHn3/+uSTpww8/VG5urq666ipTbZakHj16+PweFxcnSQElHK2ZNm2aTj31VF133XXq0qWLLrvsMi1YsMAnsdq9e7d69+7d5LtW/fr182sbU6ZMUUxMjM8J8vz58zVkyBD17dtXkuRwOPTYY4/pvffeU5cuXTRu3Dg9/vjjOnjwYKt1b9++XYZh6IEHHlBSUpLPz4MPPiip7vtS9eLj4/XUU09p/fr1iomJ0VNPPdVsvV27dm2SANfHWv8drm3btmnjxo1NtltfruF2pZbnTUOHDh3S4cOH9fzzzzept/67RI3rbWuO7NixQ5J8Pn7aWKD9GKht27Zp8eLFTeqeOHGiT90hISG66KKL9N///tf73ac33nhDNTU1PslUoH0PAIHiO1MAjimXXnqpxo4dqzfffFMffPCBnnjiCT322GN64403dOaZZ8rtdmvSpEkqLCzU3Xffrf79+ysiIkL79u3TzJkzW7zqItWdbE6YMEH9+/fXn/70J6Wlpclut+vdd9/Vn//85ybrNryK1dDkyZPVpUsXvfrqqxo3bpxeffVVpaSkeE8YzbDZbM0uNxrcRKE5LT2U2O12+/weFhamzz77TEuXLtU777yjxYsXa/78+Tr99NP1wQ
cftLj9QDgcDp1//vl688039cwzzyg3N1fLli3TI4884lPutttu0znnnKO33npL77//vh544AHNmTNHH3/8sU466aR
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Есть шумы, убираем\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df[\"salary_in_usd\"].quantile(0.25)\n",
|
|||
|
"Q3 = df[\"salary_in_usd\"].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df[\"salary_in_usd\"] < (Q1 - threshold)) | (\n",
|
|||
|
" df[\"salary_in_usd\"] > (Q3 + threshold)\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае мы уберем выбросы\n",
|
|||
|
"median_salary = df[\"salary_in_usd\"].median()\n",
|
|||
|
"df.loc[outliers, \"salary_in_usd\"] = 0\n",
|
|||
|
"df = df[df.salary_in_usd != 0]\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"salary_in_usd\"], df[\"experience_level\"])\n",
|
|||
|
"plt.xlabel(\"salary_in_usd\")\n",
|
|||
|
"plt.ylabel(\"experience_level\")\n",
|
|||
|
"plt.title(\"salary in usd vs experience_level\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Теперь создадим выборки."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 265,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 2214\n",
|
|||
|
"Размер контрольной выборки: 739\n",
|
|||
|
"Размер тестовой выборки: 739\n",
|
|||
|
"Распределение salary_in_usd в обучающей выборке:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"130000 60\n",
|
|||
|
"150000 59\n",
|
|||
|
"100000 56\n",
|
|||
|
"160000 56\n",
|
|||
|
"120000 52\n",
|
|||
|
" ..\n",
|
|||
|
"127500 1\n",
|
|||
|
"9466 1\n",
|
|||
|
"57872 1\n",
|
|||
|
"134024 1\n",
|
|||
|
"122900 1\n",
|
|||
|
"Name: count, Length: 741, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение salary_in_usd в контрольной выборке:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"100000 25\n",
|
|||
|
"150000 20\n",
|
|||
|
"140000 19\n",
|
|||
|
"120000 16\n",
|
|||
|
"135000 16\n",
|
|||
|
" ..\n",
|
|||
|
"240500 1\n",
|
|||
|
"93919 1\n",
|
|||
|
"77364 1\n",
|
|||
|
"87738 1\n",
|
|||
|
"99050 1\n",
|
|||
|
"Name: count, Length: 354, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение salary_in_usd в тестовой выборке:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"120000 23\n",
|
|||
|
"150000 19\n",
|
|||
|
"100000 18\n",
|
|||
|
"160000 16\n",
|
|||
|
"200000 13\n",
|
|||
|
" ..\n",
|
|||
|
"109000 1\n",
|
|||
|
"133000 1\n",
|
|||
|
"245000 1\n",
|
|||
|
"51039 1\n",
|
|||
|
"146300 1\n",
|
|||
|
"Name: count, Length: 364, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
|
|||
|
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
|
|||
|
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df[\"salary_in_usd\"].value_counts()\n",
|
|||
|
" print(f\"Распределение salary_in_usd в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 266,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение salary_in_usd в обучающей выборке после oversampling:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"127221 60\n",
|
|||
|
"105000 60\n",
|
|||
|
"100000 60\n",
|
|||
|
"260000 60\n",
|
|||
|
"130000 60\n",
|
|||
|
" ..\n",
|
|||
|
"110000 60\n",
|
|||
|
"113900 60\n",
|
|||
|
"54685 60\n",
|
|||
|
"193900 60\n",
|
|||
|
"50000 60\n",
|
|||
|
"Name: count, Length: 741, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение salary_in_usd в контрольной выборке после oversampling:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"99050 25\n",
|
|||
|
"126277 25\n",
|
|||
|
"38400 25\n",
|
|||
|
"56738 25\n",
|
|||
|
"215050 25\n",
|
|||
|
" ..\n",
|
|||
|
"75000 25\n",
|
|||
|
"140000 25\n",
|
|||
|
"100000 25\n",
|
|||
|
"175000 25\n",
|
|||
|
"90734 25\n",
|
|||
|
"Name: count, Length: 354, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение salary_in_usd в тестовой выборке после oversampling:\n",
|
|||
|
"salary_in_usd\n",
|
|||
|
"219000 23\n",
|
|||
|
"143860 23\n",
|
|||
|
"72500 23\n",
|
|||
|
"140000 23\n",
|
|||
|
"66837 23\n",
|
|||
|
" ..\n",
|
|||
|
"126000 23\n",
|
|||
|
"109000 23\n",
|
|||
|
"220000 23\n",
|
|||
|
"250000 23\n",
|
|||
|
"80000 23\n",
|
|||
|
"Name: count, Length: 364, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop(\"salary_in_usd\", axis=1)\n",
|
|||
|
" y = df[\"salary_in_usd\"]\n",
|
|||
|
"\n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
"\n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Forbes Billionaires Database"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"https://www.kaggle.com/datasets/surajjha101/forbes-billionaires-data-preprocessed Список миллиардеров Forbes\n",
|
|||
|
"Использование: Узнать, когда же разбогатеешь\n",
|
|||
|
"Входные данные: Имя, Возраст, Страна, компания, Индустрия"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 267,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//Forbes Billionaires.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Анализируем датафрейм при помощи \"ящика с усами\". Есть смещение в сторону меньших значений, это можно исправить при помощи oversampling и undersampling."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 268,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsdElEQVR4nO3de5xVVd348e9cYJgEhhQFRgFBDZRCUUmRsHikvKBpKYqhQuCjJqSYlRgmWPLoKy0zCy89ChaEiSlmQoqCGMajoCGSBah4F1CRqyiXWb8/+nFeTXgZbmsYfL9fr/PK2XudmXWW25gPZ+99ilJKKQAAALaz4tqeAAAA8MkgPgAAgCzEBwAAkIX4AAAAshAfAABAFuIDAADIQnwAAABZiA8AACAL8QEAAGQhPgB2ci+++GIUFRXF6NGja3sqAHzCiQ+AGho9enQUFRVVe+yxxx7RvXv3mDRpUvb5PPLII9XmUq9evWjbtm2cddZZ8cILL2yTn/HXv/41hg8fHsuWLdsm36+2TZw4MYqKiqKysjKqqqpqezoAnziltT0BgLrmRz/6UbRp0yZSSrF48eIYPXp0HHfccXHffffF8ccfn30+F1xwQXTu3DnWrVsXTz31VNxyyy1x//33xzPPPBOVlZVb9b3/+te/xhVXXBH9+vWLJk2abJsJ16KxY8fG3nvvHS+++GJMmTIlevToUdtTAvhE8c4HwGY69thj44wzzogzzzwzvvvd78Zf/vKXqFevXowbN65W5tOtW7c444wz4pvf/GbccMMNce2118bSpUvj9ttvr5X57KhWr14d9957b3znO9+JTp06xdixY2t7SgCfOOIDYCs1adIkysvLo7S0+pvJq1evjosvvjhatmwZZWVl0a5du7j22msjpRQREWvWrIn27dtH+/btY82aNYXnLV26NFq0aBFHHHFEbNiwYbPn81//9V8REbFw4cKPHDdlypTo1q1b7LLLLtGkSZM48cQT4x//+Edh//Dhw+N73/teRES0adOmcHrXiy+++LFz+M/T0zY+HnnkkU3G9uvX7wPHDh8+vNq4u+66Kw499NBo1KhRtXHXXnvtx84nIuKee+6JNWvWRK9evaJ3795x9913x3vvvbfJuDVr1sQFF1wQTZs2jUaNGsVXv/rVeO211z5wTq+99lr0798/mjVrFmVlZdGhQ4e47bbbajQfgE8ip10BbKbly5fHW2+9FSmlWLJkSdxwww2xatWqOOOMMwpjUkrx1a9+NaZOnRoDBgyIgw46KB544IH43ve+F6+99lpcd911UV5eHrfffnt07do1hg4dGj/72c8iImLgwIGxfPnyGD16dJSUlGz2/J5//vmIiNhtt90+dMxDDz0Uxx57bLRt2zaGDx8ea9asiRtuuCG6du0aTz31VOy9997x9a9/PebPnx/jxo2L6667Lpo2bRoREbvvvnuN5vHlL385zjrrrIiImDlzZvziF7/40LFNmzaN6667rvD1mWeeWW3/jBkz4tRTT40DDzwwrr766qioqIi33norLrroohrNJeJfp1x17949mjdvHr17944hQ4bEfffdF7169ao2rl+/fnHnnXfGmWeeGYcffnhMmzYtevbsucn3W7x4cRx++OFRVFQUgwYNit133z0mTZoUAwYMiBUrVsTgwYNrPDeAT4wEQI2MGjUqRcQmj7KysjR69OhqYydMmJAiIl155ZXVtp9yyimpqKgoPffcc4Vtl156aSouLk6PPvpoGj9+fIqI9POf//xj5zN16tQUEem2225Lb775Znr99dfT/fffn/bee+9UVFSUZs6cmVJKaeHChSki0qhRowrPPeigg9Iee+yR3n777cK2p59+OhUXF6ezzjqrsO2aa65JEZEWLlxY43Vau3Ztiog0aNCgwraNr2vq1KmbjO/Tp09q06ZNtW0RkYYNG1b4+tJLL00Rkd54443Cto2v65prrvnYOS1evDiVlpamX//614
VtRxxxRDrxxBOrjXvyySdTRKTBgwdX296vX79N5jRgwIDUokWL9NZbb1Ub27t371RRUZHefffdj50XwCeN064ANtOvfvWrmDx5ckyePDnGjBkT3bt3j7PPPjvuvvvuwpiJEydGSUlJXHDBBdWee/HFF0dKqdrdsYYPHx4dOnSIvn37xvnnnx9f/OIXN3neR+nfv3/svvvuUVlZGT179ozVq1fH7bffHoceeugHjn/jjTdi9uzZ0a9fv9h1110L2zt27Bhf/vKXY+LEiTX+2R9k46lMDRo0qNH4tWvXRllZ2UeOWblyZRQXF2/xRe933HFHFBcXx8knn1zYdvrpp8ekSZPinXfeKWz785//HBER559/frXnf/vb3672dUop/vCHP8QJJ5wQKaV46623Co+jjz46li9fHk899dQWzRVgZ+a0K4DN9PnPf77aL/ann356dOrUKQYNGhTHH3981K9fP1566aWorKyMRo0aVXvu/vvvHxERL730UmFb/fr147bbbovOnTtHgwYNYtSoUVFUVFTj+Vx++eXRrVu3KCkpiaZNm8b++++/yfUn/27jz27Xrt0m+/bff/944IEHYvXq1bHLLrvUeA7/7q233oqIiIqKihqNX7ZsWTRs2PAjx3Tp0iV++ctfxoUXXhjf//73o6Kiolo0fJwxY8bE5z//+Xj77bfj7bffjoiITp06xdq1a2P8+PFxzjnnRMS/1qa4uDjatGlT7fn77rtvta/ffPPNWLZsWdxyyy1xyy23fODPXLJkSY3nB/BJIT4AtlJxcXF07949rr/++liwYEF06NBhs7/HAw88EBH/etdgwYIFm/zy+1E+97nP7VC3jN14Qfree+9do/GLFi2K1q1bf+SY3r17x1NPPRU33HDDh/6y/2EWLFgQM2fOjIiI/fbbb5P9Y8eOLcRHTW38jJAzzjgj+vbt+4FjOnbsuFnfE+CTQHwAbAPr16+PiIhVq1ZFRETr1q3joYceipUrV1Z79+Of//xnYf9Gc+bMiR/96EfxzW9+M2bPnh1nn312PPPMMzV+52BzbfzZ8+bN22TfP//5z2jatGnhXY/NeQdmo1mzZkVEfOhpX/9u3bp18dxzz8UxxxzzkeOKi4vj2muvjWeeeSYWLlwYI0eOjMWLF1e7yP/DjB07NurVqxe//e1vN7mAf/r06fGLX/wiXn755WjVqlW0bt06qqqqYuHChdVC5bnnnqv2vN133z0aNWoUGzZs2KHCD2BH55oPgK20bt26ePDBB6N+/fqF06qOO+642LBhQ/zyl7+sNva6666LoqKiOPbYYwvP7devX1RWVsb1118fo0ePjsWLF2/WXZw2V4sWLeKggw6K22+/vdonl8+dOzcefPDBOO644wrbNkbI5nzC+V133RXt2rWL9u3bf+zYe++9N9asWVO4PfBHueGGG2LKlCkxduzY6NGjR3Tt2rVG8xk7dmx069YtTjvttDjllFOqPTbeSnjjZ7QcffTRERExcuTITX72vyspKYmTTz45/vCHP8TcuXM3+ZlvvvlmjeYG8EnjnQ+AzTRp0qTCOxhLliyJ3/3ud7FgwYIYMmRING7cOCIiTjjhhOjevXsMHTo0XnzxxTjwwAPjwQcfjHvvvTcGDx4c++yzT0REXHnllTF79ux4+OGHo1GjRtGxY8e4/PLL47LLLotTTjmlWghsS9dcc00ce+yx0aVLlxgwYEDhVrsVFRXVPsvikEMOiYiIoUOHRu/evaNevXpxwgknfOD1IC+88EL85Cc/iSeeeCK+/vWvx5gxYwr7Np72NHny5GjVqlU0b948hg0bFiNHjowjjjgivvKVr3zkfP/+97/H97///Rg+fHh07ty5xq/z8ccfj+eeey4GDRr0gfv33HPPOPjgg2Ps2LFxySWXxCGHHBInn3xy/PznP4+33367cKvd+fPnR0T1d4KuvvrqmDp1ahx22GHx3//933HAAQfE0qVL46mnnoqHHnooli5dWuN5An
xi1PLdtgDqjA+61W6DBg3SQQcdlG688cZUVVVVbfzKlSvTRRddlCorK1O9evXSfvvtl6655prCuCeffDKVlpamb3/
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Box plot для столбца Age\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['Age'])\n",
|
|||
|
"plt.title('Box Plot для Age')\n",
|
|||
|
"plt.xlabel('Age')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 269,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB6P0lEQVR4nO3deXQUVfr/8U8nZGNJQpCQgCwBF4hsoiyRTREERRTRGUEYcUOHAXdnxHEUkFF0/Pobd3AbdQQBnUEQFxwUAZGwCIKGKAJGUUmIEEnCEgLp+v0Ru6VJursq3en1/Ton50DVTdWtSnV1PXXvfa7NMAxDAAAAAADTYoJdAQAAAAAINwRSAAAAAGARgRQAAAAAWEQgBQAAAAAWEUgBAAAAgEUEUgAAAABgEYEUAAAAAFhEIAUAAAAAFhFIAQAAAIBFBFIAAMu+++472Ww2vfLKK8GuioulS5eqe/fuSkxMlM1m0/79+4NdJYSgdu3a6eKLLw52NQCEOQIpADjOl19+qSuuuEJt27ZVYmKiWrVqpSFDhuipp56qt32+/vrrevzxx2ss3717t6ZNm6bNmzfX275PtGLFCtlsNudPXFyc2rdvr6uvvlrffvutX/axZs0aTZs2ze9Bzr59+/T73/9eSUlJeuaZZ/Taa6+pUaNGXn/v2Weflc1mU+/evf1an1DjCH5tNpv++9//1lg/bdo02Ww27d271/K26+tv6ov8/HxNmzZN3333XbCrAiBCEUgBwK/WrFmjs88+W1u2bNGECRP09NNP64YbblBMTIyeeOKJetuvp0Bq+vTpAQ2kHG655Ra99tprev755zV8+HAtWLBAPXv21O7du33e9po1azR9+nS/P3Rv2LBB5eXlmjFjhq6//nqNGzdOcXFxXn9v7ty5ateundavX68dO3b4tU6h6oEHHpBhGH7bXn39TX2Rn5+v6dOnE0gBqDcNgl0BAAgVDz74oFJSUrRhwwalpqa6rCsuLg5OperBwYMHvbbU9O/fX1dccYUk6dprr9Vpp52mW265Ra+++qruueeeQFTTMsff6MS/nScFBQVas2aNFi5cqJtuuklz587V1KlT66mGoaF79+7avHmz3nrrLY0aNSrY1fG7iooKxcfHB7saAKIALVIA8KudO3fqjDPOqPVBPD09vcayOXPmqFevXmrYsKGaNm2qAQMG6H//+59z/eLFizV8+HC1bNlSCQkJ6tChg2bMmKGqqipnmXPPPVfvvvuuvv/+e2e3q3bt2mnFihXq2bOnpOpAxrHu+DFJ69at07Bhw5SSkqKGDRtq4MCB+vTTT13q6OiulZ+fr6uuukpNmzZVv379LJ+bQYMGSaoOPDxZvny5+vfvr0aNGik1NVWXXnqpvvrqK5f6/PnPf5YkZWVlOY/LW6vBm2++qbPOOktJSUk66aSTNG7cOP3000/O9eeee67Gjx8vSerZs6dsNpuuueYar8c1d+5cNW3aVMOHD9cVV1yhuXPn1lpu3759+sMf/qDk5GSlpqZq/Pjx2rJlS63jxL7++mtdccUVSktLU2Jios4++2y9/fbbHutx9OhRpaWl6dprr62xrqysTImJibrrrrucy5566imdccYZzmvv7LPP1uuvv+71eCVp9OjROu2000y3Snm7zjz9TUeNGqUePXq4bG/EiBGy2Wwu52TdunWy2Wx6//33ncu+/fZb/e53v1NaWpoaNmyoPn366N1333XZlqMr6vz58/W3v/1NrVq1UsOGDfXkk0/qd7/7nSTpvPPOc9ZpxYoVLr+/evVq9erVS4mJiWrfvr3+/e9/mzqHACARSAGAU9u2bbVx40bl5eV5LTt9+nT94Q9/UFxcnB544AFNnz5drVu31vLly51lXnnlFTVu3Fh33HGHnnjiCZ111lm6//77NWXKFGeZe++9V927d9dJJ52k1157Ta+99poef/xxderUSQ888IAk6cYbb3SuGzBggKTqgGXAgAEqKy
vT1KlT9dBDD2n//v0aNGiQ1q9fX6O+v/vd73To0CE99NBDmjBhguVzs3PnTklSs2bN3Jb58MMPNXToUBUXF2vatGm64447tGbNGvXt29cZKI0aNUpjxoyRJP3zn/90Hlfz5s3dbveVV17R73//e8XGxmrmzJmaMGGCFi5cqH79+jm7kt1777268cYbJVV3W3vttdd00003eT2uuXPnatSoUYqPj9eYMWO0fft2bdiwwaWM3W7XiBEjNG/ePI0fP14PPvigCgsLnYHb8bZu3ao+ffroq6++0pQpU/TYY4+pUaNGGjlypN566y239YiLi9Nll12mRYsWqbKy0mXdokWLdOTIEY0ePVqS9MILL+iWW25Rdna2Hn/8cU2fPl3du3fXunXrvB6vJMXGxupvf/ubtmzZ4rFOkrnrzNPftH///tqyZYvKysokSYZh6NNPP1VMTIw++eQT534++eQTxcTEqG/fvpKkPXv26JxzztEHH3ygP/3pT3rwwQdVUVGhSy65pNY6z5gxQ++++67uuusuPfTQQ7rgggt0yy23SJL++te/OuvUqVMn5+/s2LFDV1xxhYYMGaLHHntMTZs21TXXXKOtW7eaOo8AIAMAYBiGYfzvf/8zYmNjjdjYWCMnJ8f4y1/+YnzwwQdGZWWlS7nt27cbMTExxmWXXWZUVVW5rLPb7c5/Hzp0qMY+brrpJqNhw4ZGRUWFc9nw4cONtm3b1ii7YcMGQ5Lx8ssv19jHqaeeagwdOrTG/rKysowhQ4Y4l02dOtWQZIwZM8bUOfj4448NSca//vUv4+effzZ2795tvPvuu0a7du0Mm81mbNiwwTAMwygoKKhRt+7duxvp6enGvn37nMu2bNlixMTEGFdffbVz2aOPPmpIMgoKCrzWp7Ky0khPTzc6d+5sHD582Ln8nXfeMSQZ999/v3PZyy+/bEhy1tGbzz77zJBkLFu2zDCM6vN68sknG7feeqtLuf/+97+GJOPxxx93LquqqjIGDRpU4xycf/75RpcuXVz+vna73TjnnHOMU0891WN9PvjgA0OSsWTJEpflF110kdG+fXvn/y+99FLjjDPOMHWMx3P8zR599FHj2LFjxqmnnmp069bNeQ05rpWff/7ZWW+z15m7v6njGn7vvfcMwzCML774wpBk/O53vzN69+7tLHfJJZcYZ555pvP/t912myHJ+OSTT5zLysvLjaysLKNdu3bOz53jem3fvn2Nz9ubb75pSDI+/vjjGueibdu2hiRj1apVzmXFxcVGQkKCceedd5o6nwBAixQA/GrIkCHKzc3VJZdcoi1btugf//iHhg4dqlatWrl0Q1q0aJHsdrvuv/9+xcS43kZtNpvz30lJSc5/l5eXa+/everfv78OHTqkr7/+us713Lx5s7Zv366rrrpK+/bt0969e7V3714dPHhQ559/vlatWiW73e7yO3/84x8t7eO6665T8+bN1bJlSw0fPlwHDx7Uq6++qrPPPrvW8oWFhdq8ebOuueYapaWlOZd37dpVQ4YM0XvvvWf9QCV99tlnKi4u1p/+9CclJiY6lw8fPlwdO3as0dXLirlz56pFixY677zzJFX/7a688krNnz/fpfvl0qVLFRcX59KSFxMTo0mTJrlsr6SkRMuXL9fvf/97599779692rdvn4YOHart27e7dEc80aBBg3TSSSdpwYIFzmW//PKLli1bpiuvvNK5LDU1VT/++GONljMrjm+VWrRoUa1l6nKdnejMM89U48aNtWrVKknVLU8nn3yyrr76am3atEmHDh2SYRhavXq1+vfv7/y99957T7169XLphtq4cWPdeOON+u6775Sfn++yn/Hjx7t83szIzs522Wfz5s11+umn+y07JYDIRyAFAMfp2bOnFi5cqF9++UXr16/XPffco/Lycl1xxRXOh7edO3cqJiZG2dnZHre1detWXXbZZUpJSVFycrKaN2+ucePGSZJKS0vrXMft27dLqn54bN68uc
vPiy++qCNHjtTYflZWlqV93H///Vq2bJmWL1+uL774Qrt379Yf/vAHt+W///57SdLpp59eY12nTp2cD+BWedpux44
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" Rank Name Networth Age Country Source \\\n",
|
|||
|
"1311 1292 Kevin David Lehmann 2.4 19 Germany drugstores \n",
|
|||
|
"1961 1929 Henrique Dubugras 1.5 26 Brazil fintech \n",
|
|||
|
"1975 1929 Pedro Franceschi 1.5 25 Brazil fintech \n",
|
|||
|
"2062 1929 Wang Zelong 1.5 25 China chemicals \n",
|
|||
|
"2190 2190 Alexandra Andresen 1.3 25 Norway investments \n",
|
|||
|
"2191 2190 Katharina Andresen 1.3 26 Norway investments \n",
|
|||
|
"\n",
|
|||
|
" Industry \n",
|
|||
|
"1311 Fashion & Retail \n",
|
|||
|
"1961 Finance & Investments \n",
|
|||
|
"1975 Finance & Investments \n",
|
|||
|
"2062 Metals & Mining \n",
|
|||
|
"2190 diversified \n",
|
|||
|
"2191 diversified \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABusklEQVR4nO3de3yT5f3/8XdaSkuBphQoLcihoA4KKGeonBRBUIYiuokDRefQMVDnGecU8cTUr7/NI6hz0wkCc0MRD91QTiKFIggTigyxHmmptPbAoRya+/dHl9i0SXrfadIkzev5ePB40Pu+k1xJ7rT3O9d1fS6bYRiGAAAAAACmxYS6AQAAAAAQaQhSAAAAAGARQQoAAAAALCJIAQAAAIBFBCkAAAAAsIggBQAAAAAWEaQAAAAAwCKCFAAAAABYRJACAAAAAIsIUgAAy7788kvZbDa9/PLLoW6Km+zsbPXr108JCQmy2WwqLS0NdZMQhrp166af/vSnoW4GgAhHkAKAGj799FNdfvnl6tq1qxISEtSpUyeNGzdOTz/9dNAe87XXXtOf/vSnOtsPHDig+++/Xzt27AjaY9e2bt062Ww217+4uDh1795dV199tb744ouAPMamTZt0//33BzzkFBcX6+c//7latGihZ599Vq+++qpatmxZ7+2ee+452Ww2DR06NKDtCTfO8Guz2fTPf/6zzv77779fNptNhw4dsnzfwXpPGyIvL0/333+/vvzyy1A3BUATRZACgP/ZtGmTBg0apJ07d2rmzJl65pln9Ktf/UoxMTF68skng/a4voLU/PnzGzVIOd1000169dVX9cILL2jixIlavny5Bg8erAMHDjT4vjdt2qT58+cH/KJ769atqqio0IMPPqjrrrtO06dPV1xcXL23W7Jkibp166bc3Fx9/vnnAW1TuHrggQdkGEbA7i9Y72lD5OXlaf78+QQpAEHTLNQNAIBw8fDDD8tut2vr1q1KTk5221dUVBSaRgXBkSNH6u2pGTlypC6//HJJ0rXXXqszzzxTN910k1555RXdfffdjdFMy5zvUe33zpf8/Hxt2rRJK1as0A033KAlS5Zo3rx5QWpheOjXr5927NihN954Q1OmTAl1cwKusrJSzZs3D3UzAEQBeqQA4H/279+v3r17e7wQT01NrbNt8eLFGjJkiBITE9WmTRuNGjVK//73v137V65cqYkTJ6pjx46Kj49Xjx499OCDD6qqqsp1zLnnnqt33nlHX331lWvYVbdu3bRu3ToNHjxYUnWQce6rOSdpy5YtmjBhgux2uxITEzV69Gh99NFHbm10DtfKy8vTL37xC7Vp00YjRoyw/NqMGTNGUnXw8GXNmjUaOXKkWrZsqeTkZF1yySXas2ePW3vuuOMOSVJGRobredXXa/D6669r4MCBatGihdq1a6fp06fru+++c+0/99xzNWPGDEnS4MGDZbPZdM0119T7vJYsWaI2bdpo4sSJuvzyy7VkyRKPxxUXF+uqq65SUlKSkpOTNWPGDO3cudPjPLHPPvtMl19+uVJSUpSQkKBBgwbprbfe8tmOkydPKiUlRddee22dfeXl5UpISNDtt9/u2vb000+rd+/ernNv0KBBeu211+p9vpI0depUnXnmmaZ7peo7z3y9p1OmTNGAAQPc7m/SpEmy2Wxur8mWLVtks9n03nvvubZ98cUX+tnPfqaUlBQlJiZq2LBheuedd9zuyzkUddmyZfr973+vTp06KTExUU899ZR+9rOfSZLOO+88V5vWrVvndvuNGzdqyJAhSkhIUPfu3fW3v/3N1GsIABJBCgBcunbtqm3btmnXrl31Hjt//nxdddVViouL0wMPPKD58+erc+fOWrNmjeuYl19+Wa1atdKtt96qJ598UgMHDtR9992nuXPnuo6555571K9fP7Vr106vvvqqXn31Vf3pT39Sr1699MADD0iSrr/+ete+UaNGSaoOLKNGjVJ5eb
nmzZunRx55RKWlpRozZoxyc3PrtPdnP/uZjh49qkceeUQzZ860/Nrs379fktS2bVuvx7z//vsaP368ioqKdP/99+vWW2/Vpk2bNHz4cFdQmjJliq688kpJ0h//+EfX82rfvr3X+3355Zf185//XLGxsVqwYIFmzpypFStWaMSIEa6hZPfcc4+uv/56SdXD1l599VXdcMMN9T6vJUuWaMqUKWrevLmuvPJK7du3T1u3bnU7xuFwaNKkSVq6dKlmzJihhx9+WAUFBa7gVtPu3bs1bNgw7dmzR3PnztUTTzyhli1bavLkyXrjjTe8tiMuLk6XXnqp3nzzTZ04ccJt35tvvqnjx49r6tSpkqQXX3xRN910kzIzM/WnP/1J8+fPV79+/bRly5Z6n68kxcbG6ve//7127tzps02SufPM13s6cuRI7dy5U+Xl5ZIkwzD00UcfKSYmRh9++KHrcT788EPFxMRo+PDhkqSDBw/qnHPO0b/+9S/95je/0cMPP6zKykpdfPHFHtv84IMP6p133tHtt9+uRx55RBdccIFuuukmSdLvfvc7V5t69erlus3nn3+uyy+/XOPGjdMTTzyhNm3a6JprrtHu3btNvY4AIAMAYBiGYfz73/82YmNjjdjYWCMrK8u48847jX/961/GiRMn3I7bt2+fERMTY1x66aVGVVWV2z6Hw+H6/9GjR+s8xg033GAkJiYalZWVrm0TJ040unbtWufYrVu3GpKMv/71r3Ue44wzzjDGjx9f5/EyMjKMcePGubbNmzfPkGRceeWVpl6DtWvXGpKMv/zlL8b3339vHDhwwHjnnXeMbt26GTabzdi6dathGIaRn59fp239+vUzUlNTjeLiYte2nTt3GjExMcbVV1/t2vb4448bkoz8/Px623PixAkjNTXV6NOnj3Hs2DHX9rffftuQZNx3332ubX/9618NSa421ufjjz82JBmrV682DKP6dT3ttNOMm2++2e24f/7zn4Yk409/+pNrW1VVlTFmzJg6r8H5559v9O3b1+39dTgcxjnnnGOcccYZPtvzr3/9y5BkrFq1ym37RRddZHTv3t318yWXXGL07t3b1HOsyfmePf7448apU6eMM844wzj77LNd55DzXPn+++9d7TZ7nnl7T53n8LvvvmsYhmH85z//MSQZP/vZz4yhQ4e6jrv44ouN/v37u37+7W9/a0gyPvzwQ9e2iooKIyMjw+jWrZvrc+c8X7t3717n8/b6668bkoy1a9fWeS26du1qSDI2bNjg2lZUVGTEx8cbt912m6nXEwDokQKA/xk3bpxycnJ08cUXa+fOnXrsscc0fvx4derUyW0Y0ptvvimHw6H77rtPMTHuv0ZtNpvr/y1atHD9v6KiQocOHdLIkSN19OhRffbZZ363c8eOHdq3b59+8YtfqLi4WIcOHdKhQ4d05MgRnX/++dqwYYMcDofbbX79619beoxf/vKXat++vTp27KiJEyfqyJEjeuWVVzRo0CCPxxcUFGjHjh265pprlJKS4tp+1llnady4cXr33XetP1FJH3/8sYqKivSb3/xGCQkJru0TJ05Uz5496wz1smLJkiXq0KGDzjvvPEnV790VV1yhZcuWuQ2/zM7OVlxcnFtPXkxMjGbPnu12fyUlJVqzZo1+/vOfu97vQ4cOqbi4WOPHj9e+ffvchiPWNmbMGLVr107Lly93bfvhhx+0evVqXXHFFa5tycnJ+vbbb+v0nFlRs1fqzTff9HiMP+dZbf3791erVq20YcMGSdU9T6eddpquvvpqbd++XUePHpVhGNq4caNGjhzput27776rIUOGuA1DbdWqla6//np9+eWXysvLc3ucGTNmuH3ezMjMzHR7zPbt2+snP/lJwKpTAmj6CFIAUMPgwYO1YsUK/fDDD8rNzdXdd9+tiooKXX755a6Lt/379ysmJkaZmZk+72v37t269NJLZbfblZSUpPbt22v69OmSpLKyMr/buG/fPknVF4
/t27d3+/fnP/9Zx48fr3P/GRkZlh7jvvvu0+rVq7VmzRr95z//0YEDB3TVVVd5Pf6rr76SJP3kJz+ps69Xr16uC3C
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Имеется смещение в меньшую сторону, в том числе и медианное\n",
|
|||
|
"df_cleaned = df.dropna()\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"Age\"], df[\"Networth\"])\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.ylabel(\"Networth\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Age vs Networth\")\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# уберем шумы\n",
|
|||
|
"\n",
|
|||
|
"# Статистический анализ для определения выбросов\n",
|
|||
|
"Q1 = df[\"Age\"].quantile(0.25)\n",
|
|||
|
"Q3 = df[\"Age\"].quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"# Определение порога для выбросов\n",
|
|||
|
"threshold = 1.5 * IQR\n",
|
|||
|
"outliers = (df[\"Age\"] < (Q1 - threshold)) | (\n",
|
|||
|
" df[\"Age\"] > (Q3 + threshold)\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Вывод выбросов\n",
|
|||
|
"print(\"Выбросы:\")\n",
|
|||
|
"print(df[outliers])\n",
|
|||
|
"\n",
|
|||
|
"# Обработка выбросов\n",
|
|||
|
"# В данном случае мы занулим выбросы\n",
|
|||
|
"median_charge = df[\"Age\"].median()\n",
|
|||
|
"df.loc[outliers, \"Age\"] = 0\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"Age\"], df[\"Networth\"])\n",
|
|||
|
"plt.xlabel(\"Age\")\n",
|
|||
|
"plt.ylabel(\"Networth\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Age vs Networth\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 270,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 1560\n",
|
|||
|
"Размер контрольной выборки: 520\n",
|
|||
|
"Размер тестовой выборки: 520\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Видим недостаток баланса:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 271,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Age в обучающей выборке:\n",
|
|||
|
"Age\n",
|
|||
|
"64 95\n",
|
|||
|
"66 53\n",
|
|||
|
"58 51\n",
|
|||
|
"59 50\n",
|
|||
|
"56 47\n",
|
|||
|
" ..\n",
|
|||
|
"98 2\n",
|
|||
|
"30 1\n",
|
|||
|
"29 1\n",
|
|||
|
"27 1\n",
|
|||
|
"25 1\n",
|
|||
|
"Name: count, Length: 73, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в контрольной выборке:\n",
|
|||
|
"Age\n",
|
|||
|
"64 27\n",
|
|||
|
"54 23\n",
|
|||
|
"60 17\n",
|
|||
|
"57 15\n",
|
|||
|
"81 15\n",
|
|||
|
" ..\n",
|
|||
|
"27 1\n",
|
|||
|
"32 1\n",
|
|||
|
"29 1\n",
|
|||
|
"19 1\n",
|
|||
|
"42 1\n",
|
|||
|
"Name: count, Length: 66, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в тестовой выборке:\n",
|
|||
|
"Age\n",
|
|||
|
"64 30\n",
|
|||
|
"68 24\n",
|
|||
|
"72 22\n",
|
|||
|
"65 22\n",
|
|||
|
"58 18\n",
|
|||
|
" ..\n",
|
|||
|
"100 1\n",
|
|||
|
"88 1\n",
|
|||
|
"93 1\n",
|
|||
|
"91 1\n",
|
|||
|
"33 1\n",
|
|||
|
"Name: count, Length: 62, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['Age'].value_counts()\n",
|
|||
|
" print(f\"Распределение Age в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Используем oversample"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 272,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Age в обучающей выборке после oversampling:\n",
|
|||
|
"Age\n",
|
|||
|
"59 95\n",
|
|||
|
"70 95\n",
|
|||
|
"71 95\n",
|
|||
|
"81 95\n",
|
|||
|
"67 95\n",
|
|||
|
" ..\n",
|
|||
|
"94 95\n",
|
|||
|
"29 95\n",
|
|||
|
"96 95\n",
|
|||
|
"27 95\n",
|
|||
|
"25 95\n",
|
|||
|
"Name: count, Length: 73, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в контрольной выборке после oversampling:\n",
|
|||
|
"Age\n",
|
|||
|
"57 27\n",
|
|||
|
"69 27\n",
|
|||
|
"72 27\n",
|
|||
|
"64 27\n",
|
|||
|
"54 27\n",
|
|||
|
" ..\n",
|
|||
|
"29 27\n",
|
|||
|
"38 27\n",
|
|||
|
"19 27\n",
|
|||
|
"89 27\n",
|
|||
|
"42 27\n",
|
|||
|
"Name: count, Length: 66, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Age в тестовой выборке после oversampling:\n",
|
|||
|
"Age\n",
|
|||
|
"68 30\n",
|
|||
|
"70 30\n",
|
|||
|
"76 30\n",
|
|||
|
"74 30\n",
|
|||
|
"64 30\n",
|
|||
|
" ..\n",
|
|||
|
"42 30\n",
|
|||
|
"88 30\n",
|
|||
|
"93 30\n",
|
|||
|
"91 30\n",
|
|||
|
"33 30\n",
|
|||
|
"Name: count, Length: 62, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop('Age', axis=1)\n",
|
|||
|
" y = df['Age']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## 100 Highest-Valued Unicorns\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"https://www.kaggle.com/datasets/ankanhore545/100-highest-valued-unicorns Самые взлетевшие компании.\n",
|
|||
|
"Цель: создать свою супер-компанию\n",
|
|||
|
"Входные данные: Название компании, оценочная стоимость, страна, штат, город, индустрия, год основания, имя основателя, количество работников"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 273,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" id gender age hypertension heart_disease ever_married \\\n",
|
|||
|
"0 9046 Male 67 0 1 Yes \n",
|
|||
|
"1 51676 Female 61 0 0 Yes \n",
|
|||
|
"2 31112 Male 80 0 1 Yes \n",
|
|||
|
"3 60182 Female 49 0 0 Yes \n",
|
|||
|
"4 1665 Female 79 1 0 Yes \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"5105 18234 Female 80 1 0 Yes \n",
|
|||
|
"5106 44873 Female 81 0 0 Yes \n",
|
|||
|
"5107 19723 Female 35 0 0 Yes \n",
|
|||
|
"5108 37544 Male 51 0 0 Yes \n",
|
|||
|
"5109 44679 Female 44 0 0 Yes \n",
|
|||
|
"\n",
|
|||
|
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
|
|||
|
"0 Private Urban 228.69 36.6 formerly smoked \n",
|
|||
|
"1 Self-employed Rural 202.21 NaN never smoked \n",
|
|||
|
"2 Private Rural 105.92 32.5 never smoked \n",
|
|||
|
"3 Private Urban 171.23 34.4 smokes \n",
|
|||
|
"4 Self-employed Rural 174.12 24.0 never smoked \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"5105 Private Urban 83.75 NaN never smoked \n",
|
|||
|
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
|
|||
|
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
|
|||
|
"5108 Private Rural 166.29 25.6 formerly smoked \n",
|
|||
|
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
|
|||
|
"\n",
|
|||
|
" stroke \n",
|
|||
|
"0 1 \n",
|
|||
|
"1 1 \n",
|
|||
|
"2 1 \n",
|
|||
|
"3 1 \n",
|
|||
|
"4 1 \n",
|
|||
|
"... ... \n",
|
|||
|
"5105 0 \n",
|
|||
|
"5106 0 \n",
|
|||
|
"5107 0 \n",
|
|||
|
"5108 0 \n",
|
|||
|
"5109 0 \n",
|
|||
|
"\n",
|
|||
|
"[5110 rows x 12 columns]\n",
|
|||
|
"Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
|
|||
|
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
|
|||
|
" 'smoking_status', 'stroke'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//healthcare-dataset-stroke-data.csv\", sep=\",\")\n",
|
|||
|
"\n",
|
|||
|
"df[\"age\"] = df[\"age\"].astype(int)\n",
|
|||
|
"print(df)\n",
|
|||
|
"df[\"age\"].dtype\n",
|
|||
|
"\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 274,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIQCAYAAADghdPEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAApkklEQVR4nO3de3SU9Z348U9CIASEACKXKCBwRBRBragN6FortqtIXa21tlgvpdVVXNH2aLWooNXVqq2rVlH2WLEV3dVWa70dpaKw1oioWMQLoCJeAlJEQgQilzy/P/rLlMgtRPjGJK/XORwzz3wnz2eY58S8mXlm8rIsywIAAGAHy2/oAQAAgOZBfAAAAEmIDwAAIAnxAQAAJCE+AACAJMQHAACQhPgAAACSEB8AAEAS4gMAAEhCfAAAAEmID4Bm4N133428vLyYNGlSQ4/SICZNmhR5eXnx4osv7vB9jR8/PvLy8nb4fgAaI/EBsA1qfond8E+XLl3i8MMPj8cffzz5PM8880ytWVq2bBl9+vSJU045Jd55553tso/nnnsuxo8fH8uXL98u3w+A5kt8ANTDFVdcEb///e/jd7/7XVx44YXx97//PY4++uh45JFHGmSec889N37/+9/HxIkTY/jw4fG///u/ceCBB0Z5efkX/t7PPfdcXH755eKjji655JJYvXp1Q48B8KVU0NADADRGRx11VAwePDh3edSoUdG1a9e4995745hjjkk+z6GHHhonnHBCREScfvrp0a9fvzj33HPjrrvuiosvvjj5PM1ZQUFBFBT43yvApnjmA2A76NChQxQVFW30S+fKlSvjpz/9afTo0SMKCwtjzz33jOuvvz6yLIuIiNWrV0f//v2jf//+tf61fNmyZdG9e/cYMmRIrF+/fpvn+frXvx4REQsWLNjiuqlTp8ahhx4abdu2jQ4dOsSxxx4bb7zxRu768ePHxwUXXBAREb179869vOvdd9/d6gyff3lazZ9nnnlmo7WnnXbaJteOHz++1ro//OEPMXjw4GjXrl2tdddff/1W54mIWLVqVZx55pmx8847R/v27eOUU06JTz75pNaa3XffPY455ph45plnYvDgwVFUVBQDBw7Mzf3AAw/EwIEDo3Xr1nHAAQfErFmzat3eOR8Am+efZgDqoaKiIpYuXRpZlsWSJUvi5ptvjk8//TROPvnk3Josy+Jb3/pWPP300zFq1KjYb7/94oknnogLLrggPvzww7jhhhuiqKgo7rrrrhg6dGiMHTs2fv3rX0dExOjRo6OioiImTZoULVq02Ob53n777YiI2HnnnTe75i9/+UscddRR0adPnxg/fnysXr06br755hg6dGi8/PLLsfvuu8fxxx8f8+bNi3vvvTduuOGG6Ny5c0RE7LLLLnWa48gjj4xTTjklIiJmzpwZN91002bXdu7cOW644Ybc5R/84Ae1ri8rK4sTTzwx9t1337jmmmuiuLg4li5dGueff36dZomIOOecc6JDhw4xfvz4mDt3bkyYMCEWLlyYO3emxltvvRXf//7348wzz4yTTz45rr/++hgxYkTcdttt8fOf/zzOPvvsiIi4+uqr48QTT4y5c+dGfr5/zwPYqgyAOrvzzjuziNjoT2FhYTZp0qRaa//0pz9lEZFdeeWVtbafcMIJWV5eXvbWW2/ltl188cVZfn5+Nn369Oz+++/PIiL7r//6r63O8/TTT2cRkf32t7/N/v73v2fl5eXZo48+mu2+++5ZXl5eNnPmzCzLsmzBggVZRGR33nln7rb77bdf1qVLl+zjjz/Obfvb3/6W5efnZ6ecckpu23XXXZdFRLZgwYI6/z2tWbMmi4jsnHPOyW2ruV9PP/30RutHjhyZ9e7du9a2iMjGjRuXu3zxxRdnEZEtWrQot63mfl133XVbnKfmcTvggAOyNWvW5LZfe+21WURkDz30UG5br169sojInnvuudy2J554IouIrKioKFu4cG
Fu++23377RfRo3blzmf68Am+afaQDq4ZZbbokpU6bElClT4u67747DDz88fvSjH8UDDzyQW/PYY49FixYt4txzz61125/+9KeRZVmtd8caP358DBgwIE499dQ4++yz47DDDtvodlvywx/+MHbZZZcoKSmJ4cOHx8qVK+Ouu+6qdV7KhhYtWhSvvPJKnHbaadGpU6fc9kGDBsWRRx4Zjz32WJ33vSlVVVUREdG6des6rV+zZk0UFhZucU1lZWXk5+dHhw4d6j3XGWecES1btsxdPuuss6KgoGCj+7v33ntHaWlp7vLBBx8cEf94OVvPnj032r693lkMoKkTHwD1cNBBB8WwYcNi2LBhMXLkyHj00Udj7733jnPOOSfWrFkTERELFy6MkpKSaNeuXa3b7rXXXrnra7Rq1Sp++9vfxoIFC6KysjLuvPPObTpv4LLLLospU6bE1KlTY/bs2VFeXr7Ry5Y2VLPvPffcc6Pr9tprr1i6dGmsXLmyzvv/vKVLl0ZERHFxcZ3WL1++PHbaaactriktLY3q6uoYM2ZMvP3227F06dKNztfYmj322KPW5Z122im6d+++0TksGwZGxD/vR48ePTa5fVvnAGiunPMBsB3k5+fH4YcfHjfeeGPMnz8/BgwYsM3f44knnoiIfzxrMH/+/Ojdu3edbztw4MAYNmzYNu9zR6n5ZX733Xev0/rFixdHr169trjmpJNOipdffjluvvnmmDhx4heccMs2d57N5rZn//8NBADYMs98AGwn69ati4iITz/9NCIievXqFeXl5VFZWVlr3Ztvvpm7vsbs2bPjiiuuiNNPPz3233//+NGPfhQVFRU7bNaafc+dO3ej6958883o3LlztG3bNiKiXu/cVPNJ4pt72deG1q5dG2+99VbuGaHNyc/Pj+uvvz6+9rWvxR577JF7ydu2mD9/fq3Ln376aSxatKjOkQTAFyM+ALaDtWvXxpNPPhmtWrXK/RJ99NFHx/r16+M3v/lNrbU33HBD5OXlxVFHHZW77WmnnRYlJSVx4403xqRJk+Kjjz7apndx2lbdu3eP/fbbL+66665aHx44Z86cePLJJ+Poo4/ObauJkG35kME//OEPseeee0b//v23uvahhx6K1atX594eeEtuvvnmmDp1akyePDmGDRsWQ4cOrfNMERETJ06MtWvX5i5PmDAh1q1bl3ssANixvOwKoB4ef/zx3DMYS5YsiXvuuSfmz58fF110UbRv3z4iIkaMGBGHH354jB07Nt59993Yd99948knn4yHHnoozjvvvOjbt29ERFx55ZXxyiuvxFNPPRXt2rWLQYMGxWWXXRaXXHJJnHDCCbVCYHu67rrr4qijjorS0tIYNWpU7q12i4uLa32+xgEHHBAREWPHjo2TTjopWrZsGSNGjMhFyYbeeeeduPbaa+OFF16I448/vtYzEzNnzoyIiClTpkTPnj2jW7duMW7cuLj11ltjyJAh8Y1vfGOL87722mtx4YUXxvjx4+PAAw+s131es2ZNHHHEEbm3x7311lvjkEMOiW9961v1+n4AbKOGfrstgMZkU2+127p162y//fbLJkyYkFVXV9daX1lZmZ1//vlZSUlJ1rJly2yPPfbIrrvuuty6l156KSsoKMj+4z/+o9bt1q1blx144IFZSUlJ9sknn2x2npq32r3//vu3OPem3mo3y7LsL3/5SzZ06NCsqKgoa9++fTZixIjs9ddf3+j2v/jFL7Jdd901y8/P3+Lb7m7urYg//+fOO+/MPvjgg6xHjx7Zeeedl1VUVGz0vWKDt9qtqqrKBg0alB1yyCHZunXrNrpfdX2r3WnTpmVnnHFG1rFjx2ynnXbKRo4cWeuthrPsH2+1O3z48E3OM3r06FrbNrV/b7ULsHl5WeYsOQC2j0mTJsX48eO3+AnoX/va1+K0006L0047LdlcAHw5OOcDAABIQnwAsN307ds3jjvuuC2uOfLII3PnuwDQvHjZFQAAkI
RnPgAAgCTEBwAAkES9P+ejuro6ysvLo127dvX69FsAAKBpyLIsKisro6SkJPLzN//8Rr3jo7y8PHr06FHfmwMAAE3
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df[\"bmi\"])\n",
|
|||
|
"plt.title('Box Plot для bmi')\n",
|
|||
|
"plt.xlabel('')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 275,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADsK0lEQVR4nOzdeVxU9foH8M8MDDsMIOmgmSCuhEqaphezRE2UXLJNba+fbdqi95ZZWZqV2Wbeq9XNFkuvS7vmVhCWSZjljlgqgZaCCiggO8z5/UFnmuUs37PMAjzv16vX6wpnznznzBnu95nv830eA8dxHAghhBBCCCGEMDN6ewCEEEIIIYQQ0tJQIEUIIYQQQgghClEgRQghhBBCCCEKUSBFCCGEEEIIIQpRIEUIIYQQQgghClEgRQghhBBCCCEKUSBFCCGEEEIIIQpRIEUIIYQQQgghClEgRQghhBBCCCEKUSBFCCEtQGFhIQwGA1asWOHtoTjYunUrkpOTERQUBIPBgPPnz3t7SEQn3333HQwGA7777jtvD8Wj5s2bB4PB4O1hAPCtsRBCXFEgRQjxqoMHD+KGG25Aly5dEBQUhE6dOmHUqFH4z3/+47bnXL16Nd544w2Xn586dQrz5s3Dvn373PbczvjJKv+fyWRC165dcfvtt+P333/X5Tl+/PFHzJs3T/cgp7S0FDfddBOCg4OxbNkyrFy5EqGhoYLHrlixwuF1GgwGtG/fHsOHD8eWLVtcjueP+b//+z/B8z311FO2Y0pKSmw/v/POOxEWFqbPCySEEEIk+Ht7AISQtuvHH3/E8OHDcckll2DatGmwWCz4448/sHPnTixZsgQPPfSQW5539erVyM3NxaOPPurw81OnTmH+/PmIi4tDcnKyW55bzMMPP4yBAweioaEBe/bswTvvvINNmzbh4MGD6Nixo6Zz//jjj5g/fz7uvPNOREZG6jNgAD///DMqKyuxYMECjBw5kukxzz33HOLj48FxHE6fPo0VK1Zg7Nix+Oqrr3Dttdc6HBsUFITPPvsMb775JgICAhx+t2bNGgQFBaG2tla310MIIYQoQYEUIcRrXnjhBZjNZvz8888uE/wzZ854Z1BuUFVVJbpSw7vyyitxww03AADuuusu9OjRAw8//DA+/PBDzJkzxxPDVIx/j5QEZ2PGjMHll19u+/c999yDDh06YM2aNS6BVFpaGjZs2IAtW7ZgwoQJtp//+OOPKCgowPXXX4/PPvtM24sghBBCVKLUPkKI1+Tn5+PSSy8VnIi3b9/e5WerVq3CoEGDEBISgqioKAwbNgzffPON7ffr169Heno6OnbsiMDAQCQkJGDBggVoamqyHXP11Vdj06ZNOH78uC01LC4uDt999x0GDhwIoDmQ4X9nvyfpp59+QlpaGsxmM0JCQnDVVVchOzvbYYz8noa8vDxMnToVUVFRGDp0qOJrk5qaCgAoKCiQPC4rKwtXXnklQkNDERkZiQkTJuDw4cMO43nssccAAPHx8bbXVVhYKHneTz75BAMGDEBwcDBiYmJw66234uTJk7bfX3311bjjjjsAAAMHDoTBYMCdd96p+HVGRkYiODgY/v6u3+t16tQJw4YNw+rVqx1+/r///Q99+vRBUlKS4ucDgFdffRUGgwHHjx93+d2cOXMQEBCAc+fOAQCOHj2K66+/HhaLBUFBQbj44osxefJklJeXSz7HDz/8gBtvvBGXXHIJAgMD0blzZ8ycORM1NTWqxgEAy5YtQ9euXREcHIxBgwbhhx9+wNVXX42rr75a0eu3Wq2YN28eOnbsiJCQEAwfPhx5eXmIi4uTfQ/FjhEaR21tLebNm4cePXogKCgIsbGxmDRpEvLz823HVFVV4Z///Cc6d+6MwMBA9OzZE6+++io4jnM4V0ZGBoYOHYrIyEiEhYWhZ8+eePLJJx2Oqaurw7PPPotu3brZrvnjjz+Ouro6RddHzKpVq2yfiejoaEyePB
l//PGH7fczZsxAWFgYqqurXR47ZcoUWCwWh79FW7ZssX12w8PDkZ6ejkOHDukyVkKIZ1AgRQjxmi5dumD37t3Izc2VPXb+/Pm47bbbYDKZ8Nxzz2H+/Pno3LkzsrKybMesWLECYWFhmDVrFpYsWYIBAwbgmWeewRNPPGE75qmnnkJycjJiYmKwcuVKrFy5Em+88QZ69+6N5557DgBw77332n43bNgwAM0By7Bhw1BRUYFnn30WL774Is6fP4/U1FTs2rXLZbw33ngjqqur8eKLL2LatGmKrw0/2WzXrp3oMZmZmRg9ejTOnDmDefPmYdasWfjxxx+RkpJiC5QmTZqEKVOmAAAWL15se10XXXSR6HlXrFiBm266CX5+fli4cCGmTZuGzz//HEOHDrXts3rqqadw7733AmhO11u5ciXuu+8+2ddVXl6OkpISnD17FocOHcIDDzyACxcu4NZbbxU8furUqfjqq69w4cIFAEBjYyM++eQTTJ06Vfa5xNx0000wGAz4+OOPXX738ccf45prrkFUVBTq6+sxevRo7Ny5Ew899BCWLVuGe++9F7///rvsfrNPPvkE1dXVeOCBB/Cf//wHo0ePxn/+8x/cfvvtiscBAG+99RZmzJiBiy++GC+//DKuvPJKTJw4EX/++afi1z9nzhzMnz8fl19+OV555RV0794do0ePRlVVleJziWlqasK1116L+fPnY8CAAXjttdfwyCOPoLy83PZ55zgO48ePx+LFi5GWlobXX38dPXv2xGOPPYZZs2bZznXo0CFce+21qKurw3PPPYfXXnsN48ePd/gSw2q1Yvz48Xj11Vcxbtw4/Oc//8HEiROxePFi3HzzzZpfzwsvvIDbb78d3bt3x+uvv45HH30U3377LYYNG2a7F26++WZUVVVh06ZNDo+trq7GV199hRtuuAF+fn4AgJUrVyI9PR1hYWFYtGgR5s6di7y8PAwdOlT2Sw5CiA/hCCHES7755hvOz8+P8/Pz44YMGcI9/vjj3Ndff83V19c7HHf06FHOaDRy1113HdfU1OTwO6vVavvf1dXVLs9x3333cSEhIVxtba3tZ+np6VyXLl1cjv355585ANwHH3zg8hzdu3fnRo8e7fJ88fHx3KhRo2w/e/bZZzkA3JQpU5iuwbZt2zgA3Pvvv8+dPXuWO3XqFLdp0yYuLi6OMxgM3M8//8xxHMcVFBS4jC05OZlr3749V1paavvZ/v37OaPRyN1+++22n73yyiscAK6goEB2PPX19Vz79u25pKQkrqamxvbzjRs3cgC4Z555xvazDz74gANgG6MU/ljn/wIDA7kVK1a4HA+Amz59OldWVsYFBARwK1eu5DiO4zZt2sQZDAausLDQdq3Pnj1re9wdd9zBhYaGyo5nyJAh3IABAxx+tmvXLg4A99FHH3Ecx3F79+7lAHCffPKJ7PmcCd2LCxcu5AwGA3f8+HFF46irq+PatWvHDRw4kGtoaLAdt2LFCg4Ad9VVVzGPq7i4mPP39+cmTpzo8PN58+ZxALg77rjD9jP+3ty2bZvtZ126dHE4hnfVVVc5jOP999/nAHCvv/66y7H8Z+jLL7/kAHDPP/+8w+9vuOEGzmAwcMeOHeM4juMWL17s8j47W7lyJWc0GrkffvjB4edvv/02B4DLzs4Wfawz/r7iFRYWcn5+ftwLL7zgcNzBgwc5f39/28+tVivXqVMn7vrrr3c47uOPP+YAcNu3b+c4juMqKyu5yMhIbtq0aQ7HFRcXc2az2eHnzmMhhPgWWpEihHjNqFGjkJOTg/Hjx2P//v14+eWXMXr0aHTq1AkbNmywHffll1/CarXimWeegdHo+GfLvjRwcHCw7X9XVlaipKQEV155Jaqrq/Hrr7+qHue+fftw9OhRTJ06FaWlpSgpKUFJSQmqqqowYsQIbN++HVar1eEx999/v6LnuPvuu3HRRRehY8eOSE9PR1VVFT788EOH/UT2ioqKsG/fPtx555
2Ijo62/bxv374YNWoUNm/erPyFAvjll19w5swZPPjggwgKCrL9PD09Hb169XL5tl2pZcuWISMjAxkZGVi1ahWGDx+
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"bmi\"], df[\"avg_glucose_level\"])\n",
|
|||
|
"plt.xlabel(\"bmi\")\n",
|
|||
|
"plt.ylabel(\"avg_glucose_level\")\n",
|
|||
|
"plt.title(\"Scatter Plot of BMI vs avg_glucose_level\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Удаление строк с пустыми значениями"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 276,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы:\n",
|
|||
|
" id gender age hypertension heart_disease ever_married \\\n",
|
|||
|
"21 13861 Female 52 1 0 Yes \n",
|
|||
|
"113 41069 Female 45 0 0 Yes \n",
|
|||
|
"254 32257 Female 47 0 0 Yes \n",
|
|||
|
"258 28674 Female 74 1 0 Yes \n",
|
|||
|
"270 72911 Female 57 1 0 Yes \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"4858 1696 Female 43 0 0 Yes \n",
|
|||
|
"4906 72696 Female 53 0 0 Yes \n",
|
|||
|
"4952 16245 Male 51 1 0 Yes \n",
|
|||
|
"5009 40732 Female 50 0 0 Yes \n",
|
|||
|
"5057 38349 Female 49 0 0 Yes \n",
|
|||
|
"\n",
|
|||
|
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
|
|||
|
"21 Self-employed Urban 233.29 48.9 never smoked \n",
|
|||
|
"113 Private Rural 224.10 56.6 never smoked \n",
|
|||
|
"254 Private Urban 210.95 50.1 Unknown \n",
|
|||
|
"258 Self-employed Urban 205.84 54.6 never smoked \n",
|
|||
|
"270 Private Rural 129.54 60.9 smokes \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"4858 Private Urban 100.88 47.6 smokes \n",
|
|||
|
"4906 Private Urban 70.51 54.1 never smoked \n",
|
|||
|
"4952 Self-employed Rural 211.83 56.6 never smoked \n",
|
|||
|
"5009 Self-employed Rural 126.85 49.5 formerly smoked \n",
|
|||
|
"5057 Govt_job Urban 69.92 47.6 never smoked \n",
|
|||
|
"\n",
|
|||
|
" stroke \n",
|
|||
|
"21 1 \n",
|
|||
|
"113 1 \n",
|
|||
|
"254 0 \n",
|
|||
|
"258 0 \n",
|
|||
|
"270 0 \n",
|
|||
|
"... ... \n",
|
|||
|
"4858 0 \n",
|
|||
|
"4906 0 \n",
|
|||
|
"4952 0 \n",
|
|||
|
"5009 0 \n",
|
|||
|
"5057 0 \n",
|
|||
|
"\n",
|
|||
|
"[110 rows x 12 columns]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydeXwU5f3HP7shNzmJsAGBhIBABEQQBIMoESSAgoJVDg/UYlXwgFbxooJYEW1FWqj+RAQrBW8LCqJEUCQGsdwhKBATqJCgSSCB3GTn90eYZY85nmeO3dnk+3699EV2Z2eemXlmnu/9tQmCIIAgCIIgCIIgCIJgxh7oARAEQRAEQRAEQQQbpEgRBEEQBEEQBEFwQooUQRAEQRAEQRAEJ6RIEQRBEARBEARBcEKKFEEQBEEQBEEQBCekSBEEQRAEQRAEQXBCihRBEARBEARBEAQnpEgRBEEQBEEQBEFwQooUQRAEQRAEQRAEJ6RIEQRBBAFFRUWw2WxYuXJloIfiwcaNG9G3b19ERETAZrPh9OnTgR4SYRBff/01bDYbvv7660APxa/MnTsXNpst0MMAYK2xEAThCylSBEEElP379+OWW25B586dERERgQ4dOmDEiBH4xz/+YdoxV69ejVdffdXn8xMnTmDu3LnYs2ePacf2RhRWxf9CQ0PRpUsX3Hnnnfj5558NOcZ3332HuXPnGq7klJWV4dZbb0VkZCSWLl2Kd955B9HR0ZLbrly50uM8bTYb2rZti2HDhuHzzz/32V7c5ve//73k/p5++mnXNqWlpa7Pp06ditatWxtzggRBEAShQKtAD4AgiJbLd999h2HDhqFTp06YNm0aHA4H/ve//2H79u1YvHgxHnroIVOOu3r1auTl5eHRRx/1+PzEiROYN28eUlJS0LdvX1OOLcfDDz+MAQMGoKGhAbt27cIbb7yB9evXY//+/Wjfvr2ufX/33XeYN28epk6divj4eGMGDOCHH37AmTNnMH/+fAwfPpzpN8899xxSU1MhCAJOnjyJlStXYvTo0fj0009xww03eGwbERGBjz76CP/85z8RFhbm8d2aNWsQERGB2tpaw86HIAiCIHggRYogiIDxl7/8BXFxcfjhhx98BPxff/01MIMygaqqKllPjcjVV1+NW265BQBw991345JLLsHDDz+Mt99+G08++aQ/hsmNeI94lLNRo0bhiiuucP197733ol27dlizZo2PIpWVlYV169bh888/x7hx41yff/fddygsLMSECRPw0Ucf6TsJgiAIgtAIhfYRBBEwCgoKcOmll0oK4m3btvX5bNWqVRg4cCCioqKQkJCAoUOH4ssvv3R9v3btWowZMwbt27dHeHg40tLSMH/+fDQ2Nrq2ufbaa7F+/XocPXrUFRqWkpKCr7/+GgMGDADQpMiI37nnJH3//ffIyspCXFwcoqKicM011yAnJ8djjGJOQ35+PiZPnoyEhAQMGTKE+9pkZmYCAAoLCxW327x5M66++mpER0cjPj4e48aNw8GDBz3G89hjjwEAUlNTXedVVFSkuN8PPvgA/fv3R2RkJJKSknD77bfj+PHjru+vvfZa3HXXXQCAAQMGwGazYerUqdznGR8fj8jISLRq5WvX69ChA4YOHYrVq1d7fP7vf/8bvXv3Rq9evbiPBwB//etfYbPZcPToUZ/vnnzySYSFheHUqVMAgMOHD2PChAlwOByIiIjAxRdfjIkTJ6KiokLxGN9++y1+97vfoVOnTggPD0fHjh0xc+ZM1NTUaBoHACxduhRdunRBZGQkBg4ciG+//RbXXnstrr32Wq7zdzqdmDt3Ltq3b4+oqCgMGzYM+fn5SElJUb2HcttIjaO2thZz587FJZdcgoiICCQnJ2P8+PEoKChwbVNVVYU//vGP6NixI8LDw9G9e3f89a9/hSAIHvvatGkThgwZgvj4eLRu3Rrdu3fHU0895bFNXV0dnn32WXTt2tV1zR
9//HHU1dVxXR85Vq1a5XomEhMTMXHiRPzvf/9zfT9jxgy0bt0a1dXVPr+dNGkSHA6Hx7vo888/dz27MTExGDNmDA4cOGDIWAmC8A+kSBEEETA6d+6MnTt3Ii8vT3XbefPm4Y477kBoaCiee+45zJs3Dx07dsTmzZtd26xcuRKtW7fGrFmzsHjxYvTv3x9//vOf8cQTT7i2efrpp9G3b18kJSXhnXfewTvvvINXX30VPXv2xHPPPQcAuO+++1zfDR06FECTwjJ06FBUVlbi2WefxQsvvIDTp08jMzMTO3bs8Bnv7373O1RXV+OFF17AtGnTuK+NKGy2adNGdpvs7GyMHDkSv/76K+bOnYtZs2bhu+++Q0ZGhktRGj9+PCZNmgQAWLRokeu8LrroItn9rly5ErfeeitCQkKwYMECTJs2DR9//DGGDBniyrN6+umncd999wFoCtd755138Ic//EH1vCoqKlBaWorffvsNBw4cwAMPPICzZ8/i9ttvl9x+8uTJ+PTTT3H27FkAwLlz5/DBBx9g8uTJqseS49Zbb4XNZsP777/v893777+P66+/HgkJCaivr8fIkSOxfft2PPTQQ1i6dCnuu+8+/Pzzz6r5Zh988AGqq6vxwAMP4B//+AdGjhyJf/zjH7jzzju5xwEAr732GmbMmIGLL74YL730Eq6++mrcdNNN+OWXX7jP/8knn8S8efNwxRVX4OWXX0a3bt0wcuRIVFVVce9LjsbGRtxwww2YN28e+vfvj7/97W945JFHUFFR4XreBUHA2LFjsWjRImRlZeGVV15B9+7d8dhjj2HWrFmufR04cAA33HAD6urq8Nxzz+Fvf/sbxo4d62HEcDqdGDt2LP7617/ixhtvxD/+8Q/cdNNNWLRoEW677Tbd5/OXv/wFd955J7p164ZXXnkFjz76KL766isMHTrUNRduu+02VFVVYf369R6/ra6uxqeffopbbrkFISEhAIB33nkHY8aMQevWrbFw4ULMmTMH+fn5GDJkiKqRgyAICyEQBEEEiC+//FIICQkRQkJChMGDBwuPP/648MUXXwj19fUe2x0+fFiw2+3CzTffLDQ2Nnp853Q6Xf+urq72OcYf/vAHISoqSqitrXV9NmbMGKFz584+2/7www8CAGHFihU+x+jWrZswcuRIn+OlpqYKI0aMcH327LPPCgCESZMmMV2DLVu2CACEt956S/jtt9+EEydOCOvXrxdSUlIEm80m/PDDD4IgCEJhYaHP2Pr27Su0bdtWKCsrc322d+9ewW63C3feeafrs5dfflkAIBQWFqqOp76+Xmjbtq3Qq1cvoaamxvX5Z599JgAQ/vznP7s+W7FihQDANUYlxG29/wsPDxdWrlzpsz0AYfr06UJ5ebkQFhYmvPPOO4IgCML69esFm80mFBUVua71b7/95vrdXXfdJURHR6uOZ/DgwUL//v09PtuxY4cAQPjXv/4lCIIg7N69WwAgfPDBB6r780ZqLi5YsECw2WzC0aNHucZRV1cntGnTRhgwYIDQ0NDg2m7lypUCAOGaa65hHldJSYnQqlUr4aabbvL4fO7cuQIA4a677nJ9Js7NLVu2uD7r3LmzxzYi11xzjcc43nrrLQGA8Morr/hsKz5D//nPfwQAwvPPP+/x/S233CLYbDbhyJEjgiAIwqJFi3zuszfvvPOOYLfbhW+//dbj89dff10AIOTk5Mj+1htxXokUFRUJISEhwl/+8heP7fbv3y+0atXK9bnT6RQ6dOggTJgwwWO7999/XwAgbN26VRAEQThz5owQHx8vTJs2zWO7kpISIS4uzuNz77EQBGEtyCNFEETAGDFiBHJzczF27Fjs3bsXL730EkaOHIkOHTpg3bp1ru3+85//wOl04s9//jPsds/Xlntp4MjISNe/z5w5g9LSUlx99dWorq7Gjz/+qHmce/bsweHDhzF58mSUlZWhtLQUpaWlqKqqwnXXXYetW7fC6XR6/Ob+++/nOs
Y999yDiy66CO3bt8eYMWNQVVWFt99+2yOfyJ3i4mLs2bMHU6dORWJiouvzPn36YMSIEdiwYQP/iQL473//i19//RU
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
"# Drop incomplete rows and take an independent copy, so the raw frame `df`\n",
"# is never modified and this cell gives the same result on every re-run.\n",
"df_cleaned = df.dropna()\n",
"df_cleaned = df_cleaned.loc[df_cleaned[\"bmi\"] != \"N/A\"].copy()\n",
"\n",
"# Remove noise: flag BMI outliers with the interquartile-range (IQR) rule,\n",
"# measured on the cleaned frame rather than on the raw one.\n",
"Q1 = df_cleaned[\"bmi\"].quantile(0.25)\n",
"Q3 = df_cleaned[\"bmi\"].quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"\n",
"# Values farther than 1.5 * IQR outside the quartiles count as outliers.\n",
"threshold = 1.5 * IQR\n",
"outliers = (df_cleaned[\"bmi\"] < (Q1 - threshold)) | (df_cleaned[\"bmi\"] > (Q3 + threshold))\n",
"\n",
"# Show the outlier rows.\n",
"print(\"Выбросы:\")\n",
"print(df_cleaned[outliers])\n",
"\n",
"# Replace each outlier with the median BMI, in the same frame it was detected in.\n",
"median = df_cleaned[\"bmi\"].median()\n",
"df_cleaned.loc[outliers, \"bmi\"] = median\n",
"\n",
"\n",
"# Plot the data after the outlier treatment.\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df_cleaned[\"bmi\"], df_cleaned[\"avg_glucose_level\"])\n",
"plt.xlabel(\"bmi\")\n",
"plt.ylabel(\"avg_glucose_level\")\n",
"plt.title(\"Scatter Plot of BMI vs avg_glucose_level\")\n",
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Применение методов приращения данных (аугментации)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 277,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 2945\n",
|
|||
|
"Размер контрольной выборки: 982\n",
|
|||
|
"Размер тестовой выборки: 982\n",
|
|||
|
"Распределение в обучающей выборке:\n",
|
|||
|
"age\n",
|
|||
|
"37 57\n",
|
|||
|
"52 55\n",
|
|||
|
"56 54\n",
|
|||
|
"57 54\n",
|
|||
|
"53 52\n",
|
|||
|
" ..\n",
|
|||
|
"72 20\n",
|
|||
|
"68 20\n",
|
|||
|
"7 17\n",
|
|||
|
"4 14\n",
|
|||
|
"6 13\n",
|
|||
|
"Name: count, Length: 83, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение в контрольной выборке:\n",
|
|||
|
"age\n",
|
|||
|
"78 22\n",
|
|||
|
"51 21\n",
|
|||
|
"41 21\n",
|
|||
|
"18 18\n",
|
|||
|
"63 18\n",
|
|||
|
" ..\n",
|
|||
|
"9 6\n",
|
|||
|
"10 5\n",
|
|||
|
"12 5\n",
|
|||
|
"74 4\n",
|
|||
|
"67 2\n",
|
|||
|
"Name: count, Length: 83, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение в тестовой выборке:\n",
|
|||
|
"age\n",
|
|||
|
"78 25\n",
|
|||
|
"44 23\n",
|
|||
|
"54 23\n",
|
|||
|
"50 21\n",
|
|||
|
"57 21\n",
|
|||
|
" ..\n",
|
|||
|
"11 6\n",
|
|||
|
"76 5\n",
|
|||
|
"7 5\n",
|
|||
|
"77 4\n",
|
|||
|
"6 4\n",
|
|||
|
"Name: count, Length: 83, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение в обучающей выборке после oversampling:\n",
|
|||
|
"age\n",
|
|||
|
"32 57\n",
|
|||
|
"81 57\n",
|
|||
|
"42 57\n",
|
|||
|
"31 57\n",
|
|||
|
"23 57\n",
|
|||
|
" ..\n",
|
|||
|
"10 57\n",
|
|||
|
"74 57\n",
|
|||
|
"76 57\n",
|
|||
|
"4 57\n",
|
|||
|
"29 57\n",
|
|||
|
"Name: count, Length: 83, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение в контрольной выборке после oversampling:\n",
|
|||
|
"age\n",
|
|||
|
"28 22\n",
|
|||
|
"74 22\n",
|
|||
|
"30 22\n",
|
|||
|
"14 22\n",
|
|||
|
"71 22\n",
|
|||
|
" ..\n",
|
|||
|
"80 22\n",
|
|||
|
"18 22\n",
|
|||
|
"82 22\n",
|
|||
|
"65 22\n",
|
|||
|
"67 22\n",
|
|||
|
"Name: count, Length: 83, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение в тестовой выборке после oversampling:\n",
|
|||
|
"age\n",
|
|||
|
"80 25\n",
|
|||
|
"42 25\n",
|
|||
|
"66 25\n",
|
|||
|
"29 25\n",
|
|||
|
"47 25\n",
|
|||
|
" ..\n",
|
|||
|
"7 25\n",
|
|||
|
"72 25\n",
|
|||
|
"76 25\n",
|
|||
|
"34 25\n",
|
|||
|
"13 25\n",
|
|||
|
"Name: count, Length: 83, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
"# Hold out 20% of the cleaned data as the test set.\n",
"train_val_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
"\n",
"# Split the remaining 80% into training and validation parts;\n",
"# 0.25 of that remainder is 20% of the full data, i.e. a 60/20/20 split.\n",
"train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)\n",
"\n",
"# Every row must land in exactly one of the three partitions.\n",
"assert len(train_df) + len(val_df) + len(test_df) == len(df_cleaned)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
"def check_balance(df, name, column=\"age\"):\n",
"    \"\"\"Print how many rows of `df` carry each distinct value of `column`.\n",
"\n",
"    Args:\n",
"        df: DataFrame to inspect.\n",
"        name: Label inserted into the printed heading.\n",
"        column: Column whose values are counted; defaults to \"age\".\n",
"    \"\"\"\n",
"    counts = df[column].value_counts()\n",
"    print(f\"Распределение в {name}:\")\n",
"    print(counts)\n",
"    print()\n",
"\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
"def oversample(df, target=\"age\"):\n",
"    \"\"\"Resample `df` with RandomOverSampler, using `target` as the class label.\n",
"\n",
"    All other columns are passed to the sampler as features. The returned\n",
"    frame holds the feature columns first and the `target` column last.\n",
"\n",
"    Args:\n",
"        df: DataFrame that contains the `target` column.\n",
"        target: Name of the label column to balance; defaults to \"age\".\n",
"\n",
"    Returns:\n",
"        DataFrame built from the resampled features and labels.\n",
"    \"\"\"\n",
"    X = df.drop(target, axis=1)\n",
"    y = df[target]\n",
"\n",
"    # Fixed seed keeps the resampling reproducible between runs.\n",
"    oversampler = RandomOverSampler(random_state=42)\n",
"    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore\n",
"\n",
"    return pd.concat([X_resampled, y_resampled], axis=1)\n",
"\n",
"\n",
"# NOTE(review): the validation and test splits are oversampled as well; confirm this\n",
"# is intended, since evaluation data is usually kept at its original distribution.\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 278,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение в обучающей выборке после oversampling:\n",
|
|||
|
"age\n",
|
|||
|
"32 57\n",
|
|||
|
"81 57\n",
|
|||
|
"42 57\n",
|
|||
|
"31 57\n",
|
|||
|
"23 57\n",
|
|||
|
" ..\n",
|
|||
|
"10 57\n",
|
|||
|
"74 57\n",
|
|||
|
"76 57\n",
|
|||
|
"4 57\n",
|
|||
|
"29 57\n",
|
|||
|
"Name: count, Length: 83, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение в контрольной выборке после oversampling:\n",
|
|||
|
"age\n",
|
|||
|
"28 22\n",
|
|||
|
"74 22\n",
|
|||
|
"30 22\n",
|
|||
|
"14 22\n",
|
|||
|
"71 22\n",
|
|||
|
" ..\n",
|
|||
|
"80 22\n",
|
|||
|
"18 22\n",
|
|||
|
"82 22\n",
|
|||
|
"65 22\n",
|
|||
|
"67 22\n",
|
|||
|
"Name: count, Length: 83, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение в тестовой выборке после oversampling:\n",
|
|||
|
"age\n",
|
|||
|
"80 25\n",
|
|||
|
"42 25\n",
|
|||
|
"66 25\n",
|
|||
|
"29 25\n",
|
|||
|
"47 25\n",
|
|||
|
" ..\n",
|
|||
|
"7 25\n",
|
|||
|
"72 25\n",
|
|||
|
"76 25\n",
|
|||
|
"34 25\n",
|
|||
|
"13 25\n",
|
|||
|
"Name: count, Length: 83, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
"# `oversample` is defined once, in the previous code cell; defining it again here\n",
"# would silently shadow that definition, so this cell only calls it.\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aisenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|