AIM-PIbd-31-Razubaev-S-M/Lab3/lab3.ipynb

823 lines
389 KiB
Plaintext
Raw Normal View History

2024-10-19 12:54:01 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Лабораторная 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Информация о диабете индейцев Пима"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
" 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Location of the diabetes CSV, relative to the notebook's working directory.\n",
"DATA_PATH = \".//scv//diabetes.csv\"\n",
"\n",
"# Load the dataset and print its column names as a quick schema check.\n",
"df = pd.read_csv(DATA_PATH)\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Столбцы на русском:\n",
"\n",
"- 'Pregnancies' - количество беременностей\n",
"- 'Glucose' - уровень глюкозы\n",
"- 'BloodPressure' - кровяное давление\n",
"- 'SkinThickness' - толщина кожи\n",
"- 'Insulin' - уровень инсулина\n",
"- 'BMI' - ИМТ\n",
"- 'DiabetesPedigreeFunction' - функция родословной диабета\n",
"- 'Age' - возраст\n",
"- 'Outcome' - исход"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 768 entries, 0 to 767\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Pregnancies 768 non-null int64 \n",
" 1 Glucose 768 non-null int64 \n",
" 2 BloodPressure 768 non-null int64 \n",
" 3 SkinThickness 768 non-null int64 \n",
" 4 Insulin 768 non-null int64 \n",
" 5 BMI 768 non-null float64\n",
" 6 DiabetesPedigreeFunction 768 non-null float64\n",
" 7 Age 768 non-null int64 \n",
" 8 Outcome 768 non-null int64 \n",
"dtypes: float64(2), int64(7)\n",
"memory usage: 54.1 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Pregnancies</th>\n",
" <th>Glucose</th>\n",
" <th>BloodPressure</th>\n",
" <th>SkinThickness</th>\n",
" <th>Insulin</th>\n",
" <th>BMI</th>\n",
" <th>DiabetesPedigreeFunction</th>\n",
" <th>Age</th>\n",
" <th>Outcome</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6</td>\n",
" <td>148</td>\n",
" <td>72</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>33.6</td>\n",
" <td>0.627</td>\n",
" <td>50</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>85</td>\n",
" <td>66</td>\n",
" <td>29</td>\n",
" <td>0</td>\n",
" <td>26.6</td>\n",
" <td>0.351</td>\n",
" <td>31</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8</td>\n",
" <td>183</td>\n",
" <td>64</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23.3</td>\n",
" <td>0.672</td>\n",
" <td>32</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>89</td>\n",
" <td>66</td>\n",
" <td>23</td>\n",
" <td>94</td>\n",
" <td>28.1</td>\n",
" <td>0.167</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>137</td>\n",
" <td>40</td>\n",
" <td>35</td>\n",
" <td>168</td>\n",
" <td>43.1</td>\n",
" <td>2.288</td>\n",
" <td>33</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print the schema (column dtypes and non-null counts), then show the first five rows.\n",
"df.info()\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Объект наблюдения - пациент (представитель индейцев пима), обследованный на диабет.\n",
"\n",
"Атрибуты - набор медицинских показателей пациента, таких как:\n",
"количество беременностей, уровень глюкозы, кровяное давление, толщина кожи, ИМТ, возраст и другие."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hc1Z3/8fed3jTq1ZJlWS64N4wrYNN7SQKEElqSDdkkEMimbTYJJJtlUyDwSw9JgEBIKEvvphqwce82riq2em+j6ff3h0AgJBfJKrb0eT2Pn8dz750735kzku73nnO+xzBN00REREREREQOyjLUAYiIiIiIiBzrlDiJiIiIiIgchhInERERERGRw1DiJCIiIiIichhKnERERERERA5DiZOIiIiIiMhhKHESERERERE5DCVOIiIiIiIih6HESURERERE5DCUOImIiIiIiByGEicRETmsJ554AsMwevw3derUoQ5vxGptbeXHP/4x55xzDikpKRiGwQMPPDDUYYmIDEu2oQ5ARESOH//5n//JpEmTOh//7Gc/G8JopLa2lp/85CeMHj2aGTNm8NZbbw11SCIiw5YSJxEROWJnnnkmS5Ys6Xz8l7/8hdra2qELaITLzs6moqKCrKws1q5dy9y5c4c6JBGRYUtD9URE5LDC4TAAFsvh/2w88MADGIZBcXFx57Z4PM706dO7DSXbvHkz119/PWPHjsXlcpGVlcWNN95IXV1dl3PefvvtPQ4TtNk+vv+3ZMkSpk6dyrp161i4cCFut5uCggL++Mc/dnsvP/rRj5gzZw6JiYl4vV5OPvlk3nzzzS7HFRcXd77O008/3WVfMBgkOTkZwzD41a9+1S3OjIwMIpFIl+f885//7DzfJ5PNZ555hvPPP5+cnBycTieFhYX89Kc/JRaLHfazdjqdZGVlHfY4ERE5eupxEhGRw/oocXI6nX16/kMPPcSWLVu6bV+2bBn79u3jhhtuICsri23btvHnP/+Zbdu28f7772MYRpfj//CHP+Dz+ToffzqRa2ho4LzzzuPyyy/nyiuv5LHHHuOrX/0qDoeDG2+8EYDm5mb+8pe/cOWVV/LlL3+ZlpYW/vrXv3L22WezevVqZs6c2eWcLpeL+++/n0suuaRz25NPPkkwGDzo+21paeH555/n0ksv7dx2//3343K5uj3vgQcewOfzcdttt+Hz+XjjjTf40Y9+RHNzM7/85S8P+hoiIjK4lDiJiMhhNTU1AeB2u3v93FAoxI9+9CPOPfdcXnrppS77/v3f/51vfetbXbbNnz+fK6+8knfffZeTTz65y77Pfe5zpKWlHfS1ysvLueuuu7jtttsA+MpXvsK8efP4/ve/zxe+8AXsdjvJyckUFxfjcDg6n/flL3+ZE044gd/85jf89a9/7XLOSy+9lMcff5yqqioyMzMB+Nvf/sZnPvMZHnnkkR7juPTSS/nb3/7WmTiVlpby+uuvc8UVV/DPf/6zy7GPPPJIl8/1pptu4qabbuL3v/89//3f/93nZFVERPqXhuqJiMhhfTR0Lj09vdfP/d3vfkddXR0//vGPu+37ZMIQDAapra1l/vz5AKxfv77Xr2Wz2fjKV77S+djhcPCVr3yF6upq1q1bB4DVau1MmuLxOPX19USjUU488cQeX3P27NlMmTKFhx56CICSkhLefPNNrr/++oPGceONN/Lyyy9TWVkJwIMPPsiCBQuYMGFCt2M/+Rm0tLRQW1vLySefTCAQ4IMPPuj1ZyAiIgNDiZOIiBxWSUkJNput14lTU1MT//M//8Ntt93W2VvzSfX19dxyyy1kZmbidrtJT0+noKCg87m9lZOTg9fr7bLto2Tlk3OuHnzwQaZPn47L5SI1NZX09HReeOGFg77mDTfcwP333w90DK1buHAh48ePP2gcM2fOZOrUqfz973/HNE0eeOABbrjhhh6P3bZtG5deeimJiYn4/X7S09O55pprgL59Bi
IiMjCUOImIyGHt3LmTsWPHdinGcCR+/vOfY7FY+Pa3v93j/ssvv5z77ruPm266iSeffJJXX32Vl19+GejoDRoIDz/8MNdffz2FhYX89a9/5eWXX2bZsmWcdtppB33Na665hj179vD+++/z4IMPHjQJ+qQbb7yR+++/n7fffpvKykouv/zybsc0NjZy6qmnsmnTJn7yk5/w3HPPsWzZMn7+858DA/cZiIhI72mOk4iIHFIoFGLjxo1diiMcifLycu69917uvPNOEhISulXKa2ho4PXXX+eOO+7gRz/6Uef23bt39znW8vJy2trauvQ67dq1C4AxY8YAHYv5jh07lieffLJL8YmehhJ+JDU1lYsuuqhz2N/ll19+2DLsV199Nd/+9re55ZZb+NznPkdCQkK3Y9566y3q6up48sknOeWUUzq3FxUVHdH7FRGRwaMeJxEROaRHHnmEUCjE6aef3qvn3XHHHWRmZnLTTTf1uN9qtQJgmmaX7ffcc0+f4gSIRqP86U9/6nwcDof505/+RHp6OnPmzDno665atYqVK1ce8tw33ngjmzdv5rLLLutS2e9gUlJSuPjii9m8eXNnRb9P6ymWcDjM73//+8OeX0REBpd6nEREpEdtbW385je/4Sc/+QlWqxXTNHn44Ye7HFNVVUVraysPP/wwZ555Zpd5TK+++ir/+Mc/ulSv+yS/388pp5zCL37xCyKRCKNGjeLVV189qt6WnJwcfv7zn1NcXMyECRN49NFH2bhxI3/+85+x2+0AXHDBBTz55JNceumlnH/++RQVFfHHP/6RyZMn09raetBzn3POOdTU1BxR0vSRBx54gN/97ncHrQS4cOFCkpOTue6667j55psxDIOHHnqoWzJ5KL/97W9pbGykvLwcgOeee44DBw4A8I1vfIPExMQjPpeIiBycEicREelRTU0N3//+9zsff7Ja3ad94Qtf4M033+ySOM2cOZMrr7zykK/xyCOP8I1vfIPf/e53mKbJWWedxUsvvUROTk6fYk5OTubBBx/kG9/4Bvfddx+ZmZn89re/5ctf/nLnMddffz2VlZX86U9/4pVXXmHy5Mk8/PDDPP7447z11lsHPbdhGIcshd4Tt9t9yBLuqampPP/883zrW9/iv/7rv0hOTuaaa67h9NNP5+yzzz6i1/jVr35FSUlJ5+Mnn3ySJ598EuiYm6XESUSkfxhmb25riYjIiFFcXExBQQFvvvkmS5YsOerjBtqSJUuora1l69atQxaDiIgMX5rjJCIiIiIichhKnEREpEc+n4+rr766x/WX+nKciIjI8UxD9UREZFjQUD0RERlISpxEREREREQOQ0P1REREREREDkOJk4iIiIiIyGGMuHWc4vE45eXlJCQkYBjGUIcjIiIiIiJDxDRNWlpayMnJwWI5dJ/SiEucysvLycvLG+owRERERETkGLF//35yc3MPecyIS5wSEhKAjg/H7/f3+/kjkQivvvoqZ511Fna7vd/PL4NPbTo8qV2HH7Xp8KR2HX7UpsPP8dymzc3N5OXldeYIhzKkidOdd97Jk08+yQcffIDb7WbhwoX8/Oc/Z+LEiQd9zgMPPMANN9zQZZvT6SQYDB7Ra340PM/v9w9Y4uTxePD7/cfdF0d6pjYdntSuw4/adHhSuw4/atPhZzi06ZFM4RnS4hBvv/02X/va13j//fdZtmwZkUiEs846i7a2tkM+z+/3U1FR0fmvpKRkkCIWEREREZGRaEh7nF5++eUujx944AEyMjJYt24dp5xyykGfZxgGWVlZAx2eiIiIiIgIcIzNcWpqagIgJSXlkMe1traSn59PPB5n9uzZ/M///A9Tpkzp8dhQKEQoFOp83NzcDHR0KUYikX6K/GMfnXMgzi1DQ206PKldhx+16fCkdh1+1KbDz/Hcpr2J2TBN0xzAWI5YPB7noosuorGxkXffffegx61cuZLdu3czffp0mpqa+NWvfsXy5cvZtm1bj5Uwbr/9du
64445u2x955BE8Hk+/vgcRERERETl+BAIBrrrqKpqamg5b/+CYSZy++tWv8tJLL/Huu+8ethTgJ0UiESZNmsSVV17
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Scatter plot of the diabetes pedigree function against age; points are coloured by age.\n",
"plt.figure(figsize=(10, 6))\n",
"\n",
"plt.scatter(df['Age'], df['DiabetesPedigreeFunction'], c=df['Age'], alpha=0.6)\n",
"\n",
"plt.title(\"Диаграмма 1\")\n",
"plt.ylabel(\"Функция родословной диабета\")\n",
"plt.xlabel(\"Возраст\")\n",
"# `visible` expects a bool. A string is not a boolean: even 'false' would be truthy.\n",
"plt.grid(visible=True)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+QAAAIjCAYAAACKx9GpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC/oUlEQVR4nOzdd3hb9dk38O+RZEnee+9ssvckizBadsIIOwltGKVAC20hPM/bQQuBlj6UUvYKDRDCCGWVGRIge++d2HG8t7w0LOm8f0jneNsaR8v+fq4rF0SWpV88dZ97CaIoiiAiIiIiIiIiv1IF+gBEREREREREAxEDciIiIiIiIqIAYEBOREREREREFAAMyImIiIiIiIgCgAE5ERERERERUQAwICciIiIiIiIKAAbkRERERERERAHAgJyIiIiIiIgoABiQExEREREREQUAA3IiIiIiIiKiAGBATkREpIAPPvgAgiB0+2f06NGBPt6AtXPnTvzyl7/EqFGjEBkZiZycHFx//fU4ceJEoI9GREQETaAPQERE1J888sgjOO+88+S/P/bYYwE8DT355JPYvHkzrrvuOowdOxbl5eX417/+hYkTJ2Lbtm28WEJERAEliKIoBvoQREREoe6DDz7Addddhw0bNmDevHny7fPmzUN1dTUOHToUuMMNYFu2bMHkyZOh1Wrl206ePIkxY8bg2muvxVtvvRXA0xER0UDHknUiIiIFWCwWAIBK1fev1lWrVkEQBBQWFsq32e12jB07FoIgYNWqVfLtBw4cwNKlSzFo0CDo9XqkpaXh9ttvR01NTYfH/OMf/9htubxG01YMN2/ePIwePRq7d+/GzJkzER4ejvz8fLz44otd/i2///3vMWnSJMTGxiIyMhKzZ8/Ghg0bOtyvsLBQfp7//Oc/Hd5mMpkQHx8PQRDw1FNPdTlnSkoKWltbO7zPmjVr5Merrq6Wb//4449x2WWXISMjAzqdDoMHD8af//xn2Gy2Pj/WM2fO7BCMA8DQoUMxatQoHD16tM/3JyIi8iWWrBMRESlACsh1Op1H77969WocPHiwy+3ffPMNzpw5g2XLliEtLQ2HDx/Gyy+/jMOHD2Pbtm0QBKHD/V944QVERUXJf+98gaCurg6XXnoprr/+etx444147733cPfdd0Or1eL2228HADQ0NODVV1/FjTfeiOXLl6OxsRGvvfYaLrnkEuzYsQPjx4/v8Jh6vR5vvPEGrr76avm2devWwWQy9fjvbWxsxGeffYaFCxfKt73xxhvQ6/Vd3m/VqlWIiorCAw88gKioKHz33Xf4/e9/j4aGBvztb3/r8Tl6IooiKioqMGrUKLffl4iISEkMyImIiBRgMBgAAOHh4W6/r9lsxu9//3v89Kc/xRdffNHhbb/4xS/w4IMPdrht+vTpuPHGG7Fp0ybMnj27w9uuvfZaJCUl9fhcpaWl+Pvf/44HHngAAHDnnXdi2rRpWLFiBW699VaEhYUhPj4ehYWFHTLLy5cvx4gRI/Dss8/itdde6/CYCxcuxPvvv4+KigqkpqYCAF5//XUsWrQI77zzTrfnWLhwIV5//XU5IC8qKsL69euxePFirFmzpsN933nnnQ4f17vuugt33XUXnn/+efzlL39x+yLI22+/jZKSEjz66KNuvR8REZHSWLJORESkAKmEPDk52e33fe6551BTU4M//OEPXd7WPhA1mUyorq7G9OnTAQB79uxx+7k0Gg3uvPNO+e9arRZ33nknKisrsXv3bgCAWq2Wg3G73Y7a2lpYrVZMnjy52+ecOHEiRo0ahdWrVwMAzp49iw0bNmDp0qU9nuP222/Hl19+ifLycgDAm2++iRkzZmDYsGFd7tv+Y9DY2Ijq6mrMnj0bLS0tOHbsmFv//mPHjuGee+7BjBkzsGTJErfel4iISGkMyImIiBRw9uxZaDQatwNyg8GAxx9/HA888ICcXW6vtrYW999/P1JTUxEeHo7k5G
Tk5+fL7+uujIwMREZGdrhNCoLb97S/+eabGDt2LPR6PRITE5GcnIzPP/+8x+dctmwZ3njjDQCOEvOZM2di6NChPZ5j/PjxGD16NP79739DFEWsWrUKy5Yt6/a+hw8fxsKFCxEbG4uYmBgkJyfjlltuAeDex6C8vByXXXYZYmNj8cEHH0CtVrv8vkRERL7AgJyIiEgBx48fx6BBgzoMUXPFk08+CZVKhd/+9rfdvv3666/HK6+8grvuugvr1q3D119/jS+//BKAI3vtC2+99RaWLl2KwYMH47XXXsOXX36Jb775BhdccEGPz3nLLbfg1KlT2LZtG958880eg+v2br/9drzxxhv4/vvvUV5ejuuvv77Lferr6zF37lzs378fjz76KD799FN88803ePLJJwG4/jEwGAz46U9/ivr6enz55ZfIyMhw6f2IiIh8iT3kREREXjKbzdi3b1+HoWauKC0txTPPPIOVK1ciOjq6y+T0uro6rF+/Hn/605/w+9//Xr795MmTHp+1tLQUzc3NHbLkJ06cAADk5eUBcKxwGzRoENatW9dhaFx3JfWSxMREXHnllXL5+/XXX99hUnp3br75Zvz2t7/F/fffj2uvvRbR0dFd7rNx40bU1NRg3bp1mDNnjnx7QUGBS/9ewFHqf8UVV+DEiRP49ttvMXLkSJffl4iIyJeYISciIvLSO++8A7PZjAULFrj1fn/605+QmpqKu+66q9u3SyXVoih2uP0f//iHR+cEAKvVipdeekn+u8ViwUsvvYTk5GRMmjSpx+fdvn07tm7d2utj33777Thw4ACuu+66DpPee5KQkICrrroKBw4ckCe8d9bdWSwWC55//vk+Hx8AbDYbFi9ejK1bt+L999/HjBkzXHo/IiIif2CGnIiIyEPNzc149tln8eijj0KtVkMURbz11lsd7lNRUYGmpia89dZbuOiiizr0iX/99dd4++23u+zJlsTExGDOnDn461//itbWVmRmZuLrr792KzvcWUZGBp588kkUFhZi2LBhWLt2Lfbt24eXX34ZYWFhAIDLL78c69atw8KFC3HZZZehoKAAL774IkaOHImmpqYeH/snP/kJqqqqXArGJatWrcJzzz3X42T4mTNnIj4+HkuWLMF9990HQRCwevXqLhcpevLggw/ik08+wRVXXIHa2tounx+pF52IiCgQGJATERF5qKqqCitWrJD/3n56eWe33norNmzY0CEgHz9+PG688cZen+Odd97Bvffei+eeew6iKOLiiy/GF1984XEPdHx8PN58803ce++9eOWVV5Camop//etfWL58uXyfpUuXory8HC+99BK++uorjBw5Em+99Rbef/99bNy4scfHFgSh15Vr3QkPD+91VVxiYiI+++wzPPjgg/jf//1fxMfH45ZbbsGCBQtwySWX9Pn4+/btAwB8+umn+PTTT7u8nQE5EREFkiC6eomZiIiIOigsLER+fj42bNiAefPmeX0/X5s3bx6qq6tx6NChgJ2BiIiI2rCHnIiIiIiIiCgAGJATERF5KCoqCjfffHO3+8M9uR8RERENLCxZJyIiGiBYsk5ERBRcGJATERERERERBQBL1omIiIiIiIgCgAE5ERERERERUQD0+z3kdrsdpaWliI6OhiAIgT4OERERERER9XOiKKKxsREZGRlQqXrOg/f7gLy0tBTZ2dmBPgYRERERERENMOfOnUNWVlaPb+/3AXl0dDQAxwciJiYmwKchIiIiIiKi/q6hoQHZ2dlyPNqTfh+QS2XqMTExDMiJiIiIiIjIb/pqm+ZQNyIiIiIiIqIAYEBOREREREREFAAMyImIiIiIiIgCgAE5ERERERERUQAwICciIiIiIiIKAAbkRERERERERAHAgJyIiIiIiIgoABiQExEREREREQUAA3IiIiIiIiKiAGBATkRERERERBQADMiJiIiIiIiIAoABOREREREREVEAMCAnIiIiIiIiCgAG5EREREREREQBwI
CciIiIiIiIKAAYkBMRkU+UGYzYcroaZQZjoI9CREREFJQ0gT4AERH1P2t3FmHFuoOwi4BKAFYuGoPFU3ICfSwiIiK
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Mean BMI for every distinct age, as a two-column frame (Age, BMI).\n",
"bmi_by_age = df.groupby('Age', as_index=False)['BMI'].mean()\n",
"\n",
"# Line chart of the per-age mean BMI.\n",
"plt.figure(figsize=(12, 6))\n",
"plt.plot(bmi_by_age['Age'], bmi_by_age['BMI'], marker='.')\n",
"\n",
"plt.title(\"Диаграмма 2\")\n",
"plt.xlabel(\"Возраст\")\n",
"plt.ylabel(\"ИМТ\")\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Между атрибутами присутствует связь: показатели пациента зависят друг от друга.\n",
"\n",
"Для примера на первом графике приведена связь между возрастом и функцией родословной диабета. На втором графике показана зависимость среднего ИМТ от возраста.\n",
"\n",
"Примеры бизнес-целей:\n",
"\n",
"1. Прогнозирование значения функции родословной диабета на основе возраста.\n",
"2. Наблюдение за изменением ИМТ с возрастом.\n",
"\n",
"Эффект для бизнеса: раннее выявление групп повышенного риска диабета и планирование профилактических мероприятий.\n",
"\n",
"Цели технического проекта\n",
"\n",
"Для первой цели:\n",
"\n",
"- Вход: возраст\n",
"- Целевой признак: функция родословной диабета\n",
"\n",
"Для второй цели:\n",
"\n",
"- Вход: возраст\n",
"- Целевой признак: ИМТ"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка на пропущенные значения и статистический обзор данных"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пропущенные значения по столбцам:\n",
"Pregnancies 0\n",
"Glucose 0\n",
"BloodPressure 0\n",
"SkinThickness 0\n",
"Insulin 0\n",
"BMI 0\n",
"DiabetesPedigreeFunction 0\n",
"Age 0\n",
"Outcome 0\n",
"dtype: int64\n",
"\n",
"Статистический обзор данных:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
"count 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
"mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n",
"std 3.369578 31.972618 19.355807 15.952218 115.244002 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n",
"50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n",
"75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n",
"max 17.000000 199.000000 122.000000 99.000000 846.000000 \n",
"\n",
" BMI DiabetesPedigreeFunction Age Outcome \n",
"count 768.000000 768.000000 768.000000 768.000000 \n",
"mean 31.992578 0.471876 33.240885 0.348958 \n",
"std 7.884160 0.331329 11.760232 0.476951 \n",
"min 0.000000 0.078000 21.000000 0.000000 \n",
"25% 27.300000 0.243750 24.000000 0.000000 \n",
"50% 32.000000 0.372500 29.000000 0.000000 \n",
"75% 36.600000 0.626250 41.000000 1.000000 \n",
"max 67.100000 2.420000 81.000000 1.000000 \n"
]
}
],
"source": [
"# Count missing (NaN) values in every column.\n",
"# NOTE(review): isnull() only detects NaN; zeros used as placeholders would not be counted.\n",
"print(\"Пропущенные значения по столбцам:\")\n",
"print(df.isnull().sum())\n",
"\n",
"# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.\n",
"print(\"\\nСтатистический обзор данных:\")\n",
"print(df.describe())\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
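"На основе данных выше можно сделать вывод, что пропущенных значений (NaN) нет. При этом минимальные значения столбцов Glucose, BloodPressure, SkinThickness, Insulin и BMI равны 0, что может указывать на скрытые пропуски.\n",
"\n",
"Также проверим данные на выбросы и дубликаты:"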
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Коэффициент асимметрии для столбца 'Pregnancies': 0.9016739791518588\n",
"\n",
"Коэффициент асимметрии для столбца 'Glucose': 0.17375350179188992\n",
"\n",
"Коэффициент асимметрии для столбца 'BloodPressure': -1.8436079833551302\n",
"\n",
"Коэффициент асимметрии для столбца 'SkinThickness': 0.10937249648187608\n",
"\n",
"Коэффициент асимметрии для столбца 'Insulin': 2.272250858431574\n",
"\n",
"Коэффициент асимметрии для столбца 'BMI': -0.42898158845356543\n",
"\n",
"Коэффициент асимметрии для столбца 'DiabetesPedigreeFunction': 1.919911066307204\n",
"\n",
"Коэффициент асимметрии для столбца 'Age': 1.1295967011444805\n",
"\n",
"Коэффициент асимметрии для столбца 'Outcome': 0.635016643444986\n",
"\n",
"Количество дубликатов: 0\n"
]
}
],
"source": [
"# Skewness of every numeric column, printed one column at a time.\n",
"numeric_columns = df.select_dtypes(include=[np.number]).columns\n",
"for column in numeric_columns:\n",
"    print(f\"\\nКоэффициент асимметрии для столбца '{column}': {df[column].skew()}\")\n",
"\n",
"# Number of rows that exactly repeat an earlier row.\n",
"n_duplicates = df.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {n_duplicates}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
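"На основе данных выше можно сказать, что для большинства столбцов выбросы незначительны (наибольшая асимметрия - у столбцов Insulin, DiabetesPedigreeFunction и BloodPressure). Дубликатов нет."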
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Очистка данных от шумов:"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC02klEQVR4nOzdeXhU1fkH8O9kX8jKNkGBBFwBhYJbUHAvFgttbUt/LtStiFvrvmC1BK2idW+roFRBwbVWLSjFWleU4IJihahVSHAhQUM2SEhCMvf3R5hxtjtz7r3nrvP9PA9P68y957znPefe5CST+/oURVFAREREREREAIA0uwMgIiIiIiJyEm6SiIiIiIiIwnCTREREREREFIabJCIiIiIiojDcJBEREREREYXhJomIiIiIiCgMN0lERERERERhuEkiIiIiIiIKw00SERERERFRGG6SiIiIiGz09ddfY8mSJaH/rqurw2OPPWZfQETETRIRGfPMM8/A5/PF/TdmzBi7wyMicjyfz4eLLroIL730Eurq6nD11Vdj9erVdodFlNIy7A6AiLzhuuuuw4EHHhj675tvvtnGaIiI3GOvvfbCrFmzcNJJJwEAysrK8Prrr9sbFFGK8ymKotgdBBG51zPPPINf/vKXeO2113DMMceEXj/mmGPQ2NiIDRs22BccEZGLbNq0CY2NjRgzZgzy8/PtDocopfHjdkRkSHd3NwAgLS357WTJkiXw+Xyoq6sLvRYIBHDwwQfD5/NFfCb/v//9L8466yyMGDECOTk58Pv9OOecc7B9+/aINquqquJ+1C8j4/tflB9zzDEYM2YM1q1bh4kTJyI3NxcVFRVYuHBhzFj+8Ic/YMKECSgqKkJ+fj4mTZqE1157LeK4urq6UD/PP/98xHudnZ0oKSmBz+fDHXfcERPnoEGDsHv37ohznnjiiVB7jY2Nodf/+c9/4uSTT8aQIUOQnZ2NkSNH4qabbkJvb2/SXAf7+/TTTzFjxgwUFhaif//+uOSSS9DZ2Rlx7OLFi3Hcccdh0KBByM7OxqhRo7BgwYK47f7rX//C0UcfjYKCAhQWFuLQQw/F448/HnHMO++8g6lTp6KkpAT5+fk4+OCDce+990Yc8+mnn+IXv/gFSktLkZOTg0MOOQTLly+POEbLejnrrLMi5r+kpATHHHNMzEeWRHMaXDPR7rjjjpiYysvLcdZZZ0Uc9/e//x0+nw/l5eURr3/77bc499xzMWzYMKSnp4fi7devX0xf0crLy1U/2urz+WKOX7ZsGSZMmIDc3FyUlpbi//7v//DVV1/FHWeyawMAurq6MHfuXOyzzz7Izs7G0KFDcfXVV6Orqyvm2Ndff104zmjBtRtv/OF51rI+AISuhYEDByI3Nxf7778/fv/730f0mehf8Dc7xxxzTMQPhIC+35ynpaXFXAt///vfQ3MwYMAAnHHGGfjmm28ijjnrrLNC62TkyJE4/PDD0dTUhNzc3JjxEZF1+HE7IjIkuEnKzs7Wdf7SpUvx8ccfx7z+8ssvY/PmzTj77LPh9/uxceNGPPjgg9i4cSPWrl0b803UggULIr7RjN60NTc3Y+rUqZgxYwZOPfVUPP3007jggguQlZWFc845BwDQ1taGv/3tbzj11FMxa9Ys7NixAw899BCmTJmCd999F+PGjYtoMycnB4sXL8ZPf/rT0GvPPvtszCYk3I4dO/DCCy/gZz/7Wei1xYsXIycnJ+a8JUuWoF+/frj88svRr18/vPrqq/jDH/6AtrY23H777ap9hJsxYwbKy8sxf/58rF27Fn/+85/R3NyMRx99NCJ3o0ePxvTp05GRkYEVK1bgwgsvRCAQwEUXXRQRzznnnIPRo0djzpw5KC4uxocffohVq1bhtNNOA9A3bz/+8Y9RVlaGSy65BH6/H5988gleeOEFXHLJJQCAjRs34sgjj8Ree+2Fa6+9Fvn5+Xj66afx05/+FP/4xz8ichNNbb
0AwIABA3D33XcD6PtD+HvvvRdTp07FV199heLiYmk5Taanpyf0zXe0M888E//5z3/w29/+FmPHjkV6ejoefPBBfPDBB0Jtjxs3DldccUXEa48++ihefvnliNduvvlm3HDDDZgxYwZ+85vf4LvvvsNf/vIXTJ48GR9++GEoH4DYtREIBDB9+nS89dZbOO+883DggQfi448/xt13343//e9/MT8sCPrd736HQw89VDVO2dTWx3//+19MmjQJmZmZOO+881BeXo5NmzZhxYoVuPnmm3HKKadgn332CR1/2WWX4cADD8R5550Xei3848ThFi9ejOuvvx533nln6DoA+tba2WefjUMPPRTz58/Htm3bcO+99+Ltt9+OmYNof/jDHxLeR4jIAgoRkQH33HOPAkD56KOPIl4/+uijldGjR0e8tnjxYgWAUltbqyiKonR2dirDhg1TfvSjHykAlMWLF4eO7ejoiOnriSeeUAAob775Zui1uXPnKgCU7777TjXGo48+WgGg3HnnnaHXurq6lHHjximDBg1Suru7FUVRlJ6eHqWrqyvi3ObmZmXw4MHKOeecE3qttrZWAaCceuqpSkZGhtLQ0BB67/jjj1dOO+00BYBy++23x8R56qmnKj/+8Y9Dr2/ZskVJS0tTTj311JhxxMvB7Nmzlby8PKWzs1N1vOH9TZ8+PeL1Cy+8MGa+4vUzZcoUZcSIEaH/bmlpUQoKCpTDDz9c2bVrV8SxgUBAUZS+/FVUVCjDhw9Xmpub4x6jKH05OuiggyLGEAgElIkTJyr77rtv6DUt6+XMM89Uhg8fHtHngw8+qABQ3n333YRjjZfTeOtXURTl9ttvj4hJURRl+PDhyplnnhn67/vvv1/Jzs5Wjj322IiYdu3apaSlpSmzZ8+OaPPMM89U8vPzY/qKNnz4cOXkk0+Oef2iiy5Swr+c19XVKenp6crNN98ccdzHH3+sZGRkRLwuem0sXbpUSUtLU1avXh3R5sKFCxUAyttvvx3x+r///W8FgPLMM8+oxqlm3rx5CoCINRMcf3ietayPyZMnKwUFBcqWLVsi2ozuQ62vcEcffbRy9NFHK4qiKC+++KKSkZGhXHHFFRHHdHd3K4MGDVLGjBkTcb288MILCgDlD3/4Q+i16LW7YcMGJS0tLTSO8LVGRNbhx+2IyJDgx98GDhyo+dz77rsP27dvx9y5c2Pey83NDf3/zs5ONDY24ogjjgAA4Z+6h8vIyMDs2bND/52VlYXZs2fj22+/xbp16wAA6enpyMrKAtD3k/Ompib09PTgkEMOidvn+PHjMXr0aCxduhQAsGXLFrz22msxH70Kd84552DVqlVoaGgAADzyyCOorKzEfvvtF3NseA527NiBxsZGTJo0CR0dHfj000+Fxh3+myAA+O1vfwsAWLlyZdx+Wltb0djYiKOPPhqbN29Ga2srgL7fEO3YsQPXXnstcnJyItoM/lbvww8/RG1tLS699NKYn5IHj2lqasKrr76KGTNmhMbU2NiI7du3Y8qUKfj8889jPo4UlGi9AH1zFmxv/fr1ePTRR1FWVhbxGwAtOe3t7Q21F/zX0dERt++gjo4O3Hjjjbj44osxbNiwiPfa29sRCATQv3//hG0Y9eyzzyIQCGDGjBkRsfv9fuy7774xHx8VuTb+/ve/48ADD8QBBxwQ0eZxxx0HADFtBn8LEr1WRAwaNAhA328DtVBbH9999x3efPNNnHPOOTFzIvLxPzXvvvsuZsyYgZ///Ocxv4V8//338e233+LCCy+MyMHJJ5+MAw44AC+++KJqu3PmzMH48ePxy1/+UndsRGQcP25HRIZs2bIFGRkZmjdJra2tuOWWW3D55Zdj8ODBMe83NTVh3rx5ePLJJ/Htt9/GnKvVkCFDYv4QOrgxqaurC23AHnnkEdx555349NNPI/52qKKiIm67Z599Nh588EFceeWVWLJkCSZOnIh9991XNY5x48ZhzJgxePTRR3HVVV
dhyZIluO6662L+VgTo+1ja9ddfj1dffRVtbW0R74nmIDqWkSNHIi0tLeLvHN5++23MnTsX1dXVMZuA1tZWFBUVYdO
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы в датасете:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"131 9 122 56 0 0.0 33.3 \n",
"152 9 156 86 28 155.0 34.3 \n",
"267 2 128 64 42 0.0 40.0 \n",
"270 10 101 86 37 0.0 45.6 \n",
"314 7 109 80 31 0.0 35.9 \n",
"408 8 197 74 0 0.0 25.9 \n",
"416 1 97 68 21 0.0 27.2 \n",
"434 1 90 68 8 0.0 24.5 \n",
"487 0 173 78 32 265.0 46.5 \n",
"493 4 125 70 18 122.0 28.9 \n",
"588 3 176 86 27 156.0 33.3 \n",
"657 1 120 80 48 200.0 38.9 \n",
"744 13 153 88 37 140.0 40.6 \n",
"747 1 81 74 41 57.0 46.3 \n",
"750 4 136 70 0 0.0 31.2 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"131 1.114 33 1 \n",
"152 1.189 42 1 \n",
"267 1.101 24 0 \n",
"270 1.136 38 1 \n",
"314 1.127 43 1 \n",
"408 1.191 39 1 \n",
"416 1.095 22 0 \n",
"434 1.138 36 0 \n",
"487 1.159 58 0 \n",
"493 1.144 45 1 \n",
"588 1.154 52 1 \n",
"657 1.162 41 0 \n",
"744 1.174 39 0 \n",
"747 1.096 32 0 \n",
"750 1.182 22 1 \n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC09UlEQVR4nOzdeXhU1fkH8O9M9oSsbAkCScA1oCAoEiyIVgtFoWotrVWKSxGtWneRuhC0grau/VVBUAHBrXUrKKXuSjG4gFghiggJqCRgCEkgIQnJ3N8fYYaZyZ2Zu5y7zvfzPDyPztw55z3vOfdOTmZyX48kSRKIiIiIiIgIAOC1OgAiIiIiIiI74SaJiIiIiIgoCDdJREREREREQbhJIiIiIiIiCsJNEhERERERURBukoiIiIiIiIJwk0RERERERBSEmyQiIiIiIqIg3CQREREREREF4SaJiIiIiIgoCDdJRGSIl156CR6PR/bf4MGDrQ6PiIiIKKJEqwMgInf705/+hOOOOy7w//fee6+F0RARERHFxk0SERnqrLPOwtixYwP//+STT6K2tta6gIiIiIhi4NftiMgQbW1tAACvN/ZlZvHixfB4PKiqqgo85vP5cMIJJ8Dj8WDx4sWBx//3v//hkksuwYABA5Camor8/Hxcdtll2LNnT0ibZWVlsl/1S0w8/LuhsWPHYvDgwVi3bh1GjRqFtLQ0FBcXY/78+V3Gctddd2H48OHIzs5GRkYGRo8ejffeey/kuKqqqkA/r732WshzLS0tyM3NhcfjwQMPPNAlzl69euHgwYMhr3n++ecD7QVvLP/1r3/h7LPPRp8+fZCSkoKBAwfinnvuQUdHR8xc+/v7+uuvMXnyZGRlZaF79+647rrr0NLSEnLsokWLcMYZZ6BXr15ISUlBSUkJ5s2bJ9vuv//9b5x22mnIzMxEVlYWTj75ZDz33HMhx3z88ceYMGECcnNzkZGRgRNOOAGPPvpoyDFff/01LrjgAuTl5SE1NRUnnXQSli9fHnKMmvVyySWXhMx/bm4uxo4di9WrV4e0qTSn/jUT7oEHHugSU1FRES655JKQ4/75z3/C4/GgqKgo5PHdu3fj8ssvR//+/ZGQkBCIt1u3bl36CldUVBTxq60ejyfk2Pb2dtxzzz0YOHAgUlJSUFRUhD/96U9obW3t0q6SOQ1e89H69fl8eOSRRzBo0CCkpqaid+/emD59Ovbu3atofOF5fP/99+HxePD+++8HHhs7dmzIL2QA4NNPP5WNBwCWLVuGESNGID09Hbm5uRgzZgzefPPNQJ/RcuqfP//4g9fcvn37MHz4cBQXF6O6ujricQBw9dVXw+PxdBkfEVmPnyQRkSH8m6SUlBRNr1+6dCm+/PLLLo+/9dZb2LZtGy699FLk5+dj06ZNWLBgATZt2oS1a9d2+WFo3rx5IT9ohm/a9u7diwkTJmDy5Mm48MIL8Y9//ANXXXUVkpOTcdlllwEAGhsb8eSTT+LCCy/EtGnTsG/fPjz11FMYN24cPvnkEwwdOjSkzdTUVCxatAjnnntu4LFXXnmlyyYk2L59+/D666/jvPPOCzy2aNEipKamdnnd4sWL0a1bN9x4443o1q0b3n33Xdx1111obGzEX//614h9BJs8eTKKioowd+5crF27Fn/729+wd+9ePPPMMyG5GzRoECZNmoTExESsWLECf/jDH+Dz+XD11VeHxHPZZZdh0KBBmDlzJnJycvD5559j1apV+O1vfwugc97OOeccFBQU4LrrrkN+fj6++uorvP7667juuusAAJs2bcKpp56KI444ArfddhsyMjLwj3/8A+eeey5efvnlkNyEi7ReAKBHjx54+OGHAQDff/89Hn30UUyYMAHfffcdcnJyhOU0lvb2dtx+++2yz02dOhVvv/02rr32WgwZMgQJCQlYsGAB1q9fr6jtoUOH4qabbgp57JlnnsFbb70V8tjvf/97LFmyBBdccA
FuuukmfPzxx5g7dy6++uorvPrqq4HjlMxpsCuuuAKjR48G0LnWg9sCgOnTp2Px4sW49NJL8cc//hGVlZX4+9//js8//xxr1qxBUlKSonGqNWPGDNnHZ8+ejbKyMowaNQp33303kpOT8fHHH+Pdd9/Fz372MzzyyCPYv38/AOCrr77CnDlzQr46HGnzevDgQfzyl7/Ejh07sGbNGhQUFESM7dtvv8XChQt1jpCIDCMRERngkUcekQBIX3zxRcjjp512mjRo0KCQxxYtWiQBkCorKyVJkqSWlhapf//+0s9//nMJgLRo0aLAsc3NzV36ev755yUA0ocffhh4bNasWRIA6ccff4wY42mnnSYBkB588MHAY62trdLQoUOlXr16SW1tbZIkSVJ7e7vU2toa8tq9e/dKvXv3li677LLAY5WVlRIA6cILL5QSExOlmpqawHM//elPpd/+9rcSAOmvf/1rlzgvvPBC6Zxzzgk8vn37dsnr9UoXXnhhl3HI5WD69OlSenq61NLSEnG8wf1NmjQp5PE//OEPXeZLrp9x48ZJAwYMCPx/fX29lJmZKZ1yyinSgQMHQo71+XySJHXmr7i4WCosLJT27t0re4wkdebo+OOPDxmDz+eTRo0aJR111FGBx9Ssl6lTp0qFhYUhfS5YsEACIH3yySdRxyqXU7n1K0mS9Ne//jUkJkmSpMLCQmnq1KmB/3/88cellJQU6fTTTw+J6cCBA5LX65WmT58e0ubUqVOljIyMLn2FKywslM4+++wuj1999dVS8Nv8hg0bJADS73//+5Djbr75ZgmA9O6770qSpGxO/bZs2SIBkJYsWRJ4zL/G/FavXi0BkJ599tmQ165atUr28XDFxcXS7373u5DH3nvvPQmA9N577wUeO+2006TTTjst8P8rV66UAEjjx48PiWfLli2S1+uVzjvvPKmjoyPq+CL15ec/5xctWiT5fD7poosuktLT06WPP/444nF+kydPlgYPHiz169cvZJ0QkT3w63ZEZAj/19969uyp+rWPPfYY9uzZg1mzZnV5Li0tLfDfLS0tqK2txciRIwFA8W/dgyUmJmL69OmB/09OTsb06dOxe/durFu3DgCQkJCA5ORkAJ1fG6qrq0N7eztOOukk2T6HDRuGQYMGYenSpQCA7du347333ov6lZrLLrsMq1atQk1NDQBgyZIlKC0txdFHH93l2OAc7Nu3D7W1tRg9ejSam5vx9ddfKxp38CdBAHDttdcCAFauXCnbT0NDA2pra3Haaadh27ZtaGhoAND5CdG+fftw2223ITU1NaRN/6d6n3/+OSorK3H99dcHPrkJP6aurg7vvvsuJk+eHBhTbW0t9uzZg3HjxmHLli344YcfZMcSbb0AnXPmb2/Dhg145plnUFBQEHJDETU57ejoCLTn/9fc3Czbt19zczPuvvtuXHPNNejfv3/Ic01NTfD5fOjevXvUNvTyz+2NN94Y8rj/E6g33ngDgLI59VPyifE///lPZGdn46yzzgrJ2fDhw9GtW7cuX1sN16tXL3z//fcKRniYJEmYOXMmfvnLX+KUU04Jee61116Dz+fDXXfd1eWTZbmv5Sl1yy234Nlnn8U//vEPjBgxIuqx69atwz//+U/MnTtX0VeSich8PDOJyBDbt29HYmKi6k1SQ0MD5syZgxtvvBG9e/fu8nxdXR2uu+469O7dG2lpaejZsyeKi4sDr1WrT58+yMjICHnMvzEJ/vuSJUuW4IQTTkBqaiq6d++Onj174o033ojY56WXXopFixYB6Pzq0qhRo3DUUUdFjGPo0KEYPHgwnnnmGUiSFPhqkpxNmzbhvPPOQ3Z2NrKystCzZ09cfPHFAJTnIDyWgQMHwuv1hox5zZo1OPPMM5GRkYGcnBz07NkTf/rTn0L62bp1KwBEva27kmO+/fZbSJKEO++8Ez179gz559/87N69u8vrYq0XAPjuu+8CbZ144onYunUrXn755ZCvTKnJ6d
dffx0xxkgeeughtLS0BPIXrHv37jjqqKPw5JNP4s0338Tu3btRW1sr+3dCemzfvh1erxdHHnlkyOP5+fnIycnB9u3
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Work from a pristine copy so that re-running this cell is idempotent: without it the\n",
"# replacement below would be applied again to data that has already been cleaned.\n",
"# (If the source data is reloaded with different content, delete `df_raw` before re-running.)\n",
"if 'df_raw' not in globals():\n",
"    df_raw = df.copy()\n",
"df = df_raw.copy()\n",
"\n",
"dpf = \"DiabetesPedigreeFunction\"\n",
"\n",
"# Scatter plot before cleaning.\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df[dpf], df['Age'])\n",
"plt.xlabel('Функция родословной диабета')\n",
"plt.ylabel('Возраст')\n",
"plt.title('Диаграмма рассеивания перед чисткой')\n",
"plt.show()\n",
"\n",
"# Tukey's rule: values further than 1.5 * IQR from the quartiles are treated as outliers.\n",
"Q1 = df[dpf].quantile(0.25)\n",
"Q3 = df[dpf].quantile(0.75)\n",
"\n",
"IQR = Q3 - Q1\n",
"\n",
"threshold = 1.5 * IQR\n",
"lower_bound = Q1 - threshold\n",
"upper_bound = Q3 + threshold\n",
"\n",
"# Boolean mask of the rows whose pedigree value lies outside the fences.\n",
"outliers = (df[dpf] < lower_bound) | (df[dpf] > upper_bound)\n",
"\n",
"print(\"Выбросы в датасете:\")\n",
"print(df[outliers])\n",
"\n",
"# Replace the outliers with the column median (computed before the replacement).\n",
"dpf_median = df[dpf].median()\n",
"df.loc[outliers, dpf] = dpf_median\n",
"\n",
"# Scatter plot after cleaning.\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df[dpf], df['Age'])\n",
"plt.xlabel('Функция родословной диабета')\n",
"plt.ylabel('Возраст')\n",
"plt.title('Диаграмма рассеивания после чистки')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 460\n",
"Размер контрольной выборки: 154\n",
"Размер тестовой выборки: 154\n"
]
}
],
"source": [
"# 60/20/20 split: first hold out 20% of the rows for testing, then take 25% of the\n",
"# remaining 80% (i.e. 20% of the full dataset) for validation.\n",
"# train_test_split is already imported in the first code cell.\n",
"train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n",
"\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"# Every row must end up in exactly one of the three subsets.\n",
"assert len(train_df) + len(val_df) + len(test_df) == len(df)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Видим недостаток баланса"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение функции родословной диабета в обучающей выборке:\n",
"DiabetesPedigreeFunction\n",
"0.37250 12\n",
"0.37175 10\n",
"0.25800 5\n",
"0.19700 4\n",
"0.23800 4\n",
" ..\n",
"0.52600 1\n",
"0.60000 1\n",
"0.45400 1\n",
"0.70400 1\n",
"0.46300 1\n",
"Name: count, Length: 332, dtype: int64\n",
"\n",
"Распределение функции родословной диабета в контрольной выборке:\n",
"DiabetesPedigreeFunction\n",
"0.37250 10\n",
"0.14200 3\n",
"0.37175 3\n",
"0.25400 3\n",
"0.14100 2\n",
" ..\n",
"0.24500 1\n",
"0.69300 1\n",
"0.12100 1\n",
"0.68600 1\n",
"0.12200 1\n",
"Name: count, Length: 130, dtype: int64\n",
"\n",
"Распределение функции родословной диабета в тестовой выборке:\n",
"DiabetesPedigreeFunction\n",
"0.3725 7\n",
"0.1480 2\n",
"0.4430 2\n",
"0.2070 2\n",
"0.5200 2\n",
" ..\n",
"0.2480 1\n",
"0.2360 1\n",
"0.3020 1\n",
"0.4850 1\n",
"0.7050 1\n",
"Name: count, Length: 134, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df, name, column='DiabetesPedigreeFunction', label='функции родословной диабета'):\n",
"    \"\"\"Print how often each distinct value of `column` occurs in `df`.\n",
"\n",
"    Args:\n",
"        df: DataFrame to inspect.\n",
"        name: Name of the subset; inserted into the printed header.\n",
"        column: Column whose value counts are printed. The default keeps the\n",
"            original behaviour; pass a class-label column to check class balance.\n",
"        label: Human-readable (Russian, genitive case) name of `column` used in the header.\n",
"    \"\"\"\n",
"    counts = df[column].value_counts()\n",
"    print(f\"Распределение {label} в {name}:\")\n",
"    print(counts)\n",
"    # Blank line separates consecutive reports.\n",
"    print()\n",
"\n",
"# Report the distribution for each of the three subsets in turn.\n",
"for subset, subset_name in (\n",
"    (train_df, \"обучающей выборке\"),\n",
"    (val_df, \"контрольной выборке\"),\n",
"    (test_df, \"тестовой выборке\"),\n",
"):\n",
"    check_balance(subset, subset_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Также используем oversampling и undersampling:"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оверсэмплинг:\n",
"Распределение функции родословной диабета в обучающей выборке:\n",
"DiabetesPedigreeFunction\n",
"0.37250 12\n",
"0.37175 10\n",
"0.25800 5\n",
"0.19700 4\n",
"0.23800 4\n",
" ..\n",
"0.52600 1\n",
"0.60000 1\n",
"0.45400 1\n",
"0.70400 1\n",
"0.46300 1\n",
"Name: count, Length: 332, dtype: int64\n",
"\n",
"Распределение функции родословной диабета в контрольной выборке:\n",
"DiabetesPedigreeFunction\n",
"0.3725 10\n",
"0.4390 4\n",
"0.1420 3\n",
"0.7300 3\n",
"0.2540 3\n",
" ..\n",
"0.2450 1\n",
"0.6930 1\n",
"0.1210 1\n",
"0.6860 1\n",
"0.1220 1\n",
"Name: count, Length: 130, dtype: int64\n",
"\n",
"Распределение функции родословной диабета в тестовой выборке:\n",
"DiabetesPedigreeFunction\n",
"0.3725 7\n",
"0.1480 2\n",
"0.4430 2\n",
"0.2070 2\n",
"0.5200 2\n",
" ..\n",
"0.2480 1\n",
"0.2360 1\n",
"0.3020 1\n",
"0.4850 1\n",
"0.7050 1\n",
"Name: count, Length: 134, dtype: int64\n",
"\n",
"Андерсэмплинг:\n",
"Распределение функции родословной диабета в обучающей выборке:\n",
"DiabetesPedigreeFunction\n",
"0.37250 12\n",
"0.37175 10\n",
"0.25800 5\n",
"0.29900 4\n",
"0.19700 4\n",
" ..\n",
"0.38100 1\n",
"0.30000 1\n",
"0.13300 1\n",
"0.23400 1\n",
"0.59800 1\n",
"Name: count, Length: 332, dtype: int64\n",
"\n",
"Распределение функции родословной диабета в контрольной выборке:\n",
"DiabetesPedigreeFunction\n",
"0.37250 6\n",
"0.14200 3\n",
"0.37175 3\n",
"0.25400 3\n",
"0.28400 2\n",
" ..\n",
"0.55900 1\n",
"0.69200 1\n",
"0.42300 1\n",
"0.69300 1\n",
"0.68600 1\n",
"Name: count, Length: 117, dtype: int64\n",
"\n",
"Распределение функции родословной диабета в тестовой выборке:\n",
"DiabetesPedigreeFunction\n",
"0.3725 7\n",
"0.1480 2\n",
"0.2070 2\n",
"0.2590 2\n",
"0.3700 2\n",
" ..\n",
"0.8400 1\n",
"0.5250 1\n",
"0.5360 1\n",
"0.8550 1\n",
"0.6990 1\n",
"Name: count, Length: 134, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"# Name of the derived class-label column: DiabetesPedigreeFunction cut into quantile bins.\n",
"BINNED_COLUMN = 'dpf_binned'\n",
"\n",
"\n",
"def binning(target, bins):\n",
"    \"\"\"Cut `target` into `bins` quantile-based bins and return each value's integer bin index.\"\"\"\n",
"    return pd.qcut(target, q=bins, labels=False)\n",
"\n",
"\n",
"# NOTE(review): every subset is binned with its own quantiles, so the bin edges can differ\n",
"# between subsets; consider deriving the edges from the training subset only.\n",
"train_df[BINNED_COLUMN] = binning(train_df['DiabetesPedigreeFunction'], bins=2)\n",
"val_df[BINNED_COLUMN] = binning(val_df['DiabetesPedigreeFunction'], bins=2)\n",
"test_df[BINNED_COLUMN] = binning(test_df['DiabetesPedigreeFunction'], bins=2)\n",
"\n",
"\n",
"def _resample(df, target_column, sampler):\n",
"    \"\"\"Resample `df` with `sampler`, using `target_column` as the class label.\n",
"\n",
"    Returns a new DataFrame with the feature columns followed by `target_column`.\n",
"    \"\"\"\n",
"    X = df.drop(target_column, axis=1)\n",
"    y = df[target_column]\n",
"    x_resampled, y_resampled = sampler.fit_resample(X, y)\n",
"    return pd.concat([x_resampled, y_resampled], axis=1)\n",
"\n",
"\n",
"def oversample(df, target_column):\n",
"    \"\"\"Return `df` resampled with RandomOverSampler (fixed seed) on `target_column`.\"\"\"\n",
"    return _resample(df, target_column, RandomOverSampler(random_state=42))\n",
"\n",
"\n",
"def undersample(df, target_column):\n",
"    \"\"\"Return `df` resampled with RandomUnderSampler (fixed seed) on `target_column`.\"\"\"\n",
"    return _resample(df, target_column, RandomUnderSampler(random_state=42))\n",
"\n",
"\n",
"def report_bin_balance(df, name):\n",
"    \"\"\"Print the class counts of the binned label column of `df`.\"\"\"\n",
"    print(f\"Распределение классов ({BINNED_COLUMN}) в {name}:\")\n",
"    print(df[BINNED_COLUMN].value_counts())\n",
"    print()\n",
"\n",
"\n",
"# NOTE(review): resampling is normally applied to the training subset only; the validation\n",
"# and test variants are kept to match the original notebook - confirm they are needed.\n",
"train_df_oversampled = oversample(train_df, BINNED_COLUMN)\n",
"val_df_oversampled = oversample(val_df, BINNED_COLUMN)\n",
"test_df_oversampled = oversample(test_df, BINNED_COLUMN)\n",
"\n",
"train_df_undersampled = undersample(train_df, BINNED_COLUMN)\n",
"val_df_undersampled = undersample(val_df, BINNED_COLUMN)\n",
"test_df_undersampled = undersample(test_df, BINNED_COLUMN)\n",
"\n",
"# Report the class label that was actually resampled, so the effect of resampling is visible.\n",
"print(\"Оверсэмплинг:\")\n",
"report_bin_balance(train_df_oversampled, \"обучающей выборке\")\n",
"report_bin_balance(val_df_oversampled, \"контрольной выборке\")\n",
"report_bin_balance(test_df_oversampled, \"тестовой выборке\")\n",
"\n",
"print(\"Андерсэмплинг:\")\n",
"report_bin_balance(train_df_undersampled, \"обучающей выборке\")\n",
"report_bin_balance(val_df_undersampled, \"контрольной выборке\")\n",
"report_bin_balance(test_df_undersampled, \"тестовой выборке\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}