AIM-PIbd-31-Razubaev-S-M/Lab3/lab3.ipynb

1074 lines
396 KiB
Plaintext
Raw Permalink Normal View History

2024-10-19 12:54:01 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Лабораторная 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Информация о диабете индейцев Пима"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 1,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
" 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"df = pd.read_csv(\".//scv//diabetes.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Столбцы на русском:\n",
"'Pregnancies' - количество беременностей\n",
"'Glucose' - уровень глюкозы\n",
"'BloodPressure'- кровяное давление\n",
"'SkinThickness' - толщина кожи\n",
"'Insulin' - уровень инсулина\n",
"'BMI' - ИМТ\n",
"'DiabetesPedigreeFunction' - функция родословной диабета\n",
"'Age' - возраст\n",
"'Outcome' - исход"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 2,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 768 entries, 0 to 767\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Pregnancies 768 non-null int64 \n",
" 1 Glucose 768 non-null int64 \n",
" 2 BloodPressure 768 non-null int64 \n",
" 3 SkinThickness 768 non-null int64 \n",
" 4 Insulin 768 non-null int64 \n",
" 5 BMI 768 non-null float64\n",
" 6 DiabetesPedigreeFunction 768 non-null float64\n",
" 7 Age 768 non-null int64 \n",
" 8 Outcome 768 non-null int64 \n",
"dtypes: float64(2), int64(7)\n",
"memory usage: 54.1 KB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Pregnancies</th>\n",
" <th>Glucose</th>\n",
" <th>BloodPressure</th>\n",
" <th>SkinThickness</th>\n",
" <th>Insulin</th>\n",
" <th>BMI</th>\n",
" <th>DiabetesPedigreeFunction</th>\n",
" <th>Age</th>\n",
" <th>Outcome</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6</td>\n",
" <td>148</td>\n",
" <td>72</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>33.6</td>\n",
" <td>0.627</td>\n",
" <td>50</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>85</td>\n",
" <td>66</td>\n",
" <td>29</td>\n",
" <td>0</td>\n",
" <td>26.6</td>\n",
" <td>0.351</td>\n",
" <td>31</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8</td>\n",
" <td>183</td>\n",
" <td>64</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23.3</td>\n",
" <td>0.672</td>\n",
" <td>32</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>89</td>\n",
" <td>66</td>\n",
" <td>23</td>\n",
" <td>94</td>\n",
" <td>28.1</td>\n",
" <td>0.167</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>137</td>\n",
" <td>40</td>\n",
" <td>35</td>\n",
" <td>168</td>\n",
" <td>43.1</td>\n",
" <td>2.288</td>\n",
" <td>33</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 "
]
},
2024-10-25 20:15:35 +04:00
"execution_count": 2,
2024-10-19 12:54:01 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.info()\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-10-25 20:14:00 +04:00
"Объект наблюдения - здоровье пациентов\n",
"\n",
2024-10-19 12:54:01 +04:00
"Атрибуты - содержит набор информации об обучении, такие как:\n",
"количество беременностей, глюкоза, кровяное давление, толщина кожи, ИМТ, возраст и другие атрибуты"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 3,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hc1Z3/8fed3jTq1ZJlWS64N4wrYNN7SQKEElqSDdkkEMimbTYJJJtlUyDwSw9JgEBIKEvvphqwce82riq2em+j6ff3h0AgJBfJKrb0eT2Pn8dz750735kzku73nnO+xzBN00REREREREQOyjLUAYiIiIiIiBzrlDiJiIiIiIgchhInERERERGRw1DiJCIiIiIichhKnERERERERA5DiZOIiIiIiMhhKHESERERERE5DCVOIiIiIiIih6HESURERERE5DCUOImIiIiIiByGEicRETmsJ554AsMwevw3derUoQ5vxGptbeXHP/4x55xzDikpKRiGwQMPPDDUYYmIDEu2oQ5ARESOH//5n//JpEmTOh//7Gc/G8JopLa2lp/85CeMHj2aGTNm8NZbbw11SCIiw5YSJxEROWJnnnkmS5Ys6Xz8l7/8hdra2qELaITLzs6moqKCrKws1q5dy9y5c4c6JBGRYUtD9URE5LDC4TAAFsvh/2w88MADGIZBcXFx57Z4PM706dO7DSXbvHkz119/PWPHjsXlcpGVlcWNN95IXV1dl3PefvvtPQ4TtNk+vv+3ZMkSpk6dyrp161i4cCFut5uCggL++Mc/dnsvP/rRj5gzZw6JiYl4vV5OPvlk3nzzzS7HFRcXd77O008/3WVfMBgkOTkZwzD41a9+1S3OjIwMIpFIl+f885//7DzfJ5PNZ555hvPPP5+cnBycTieFhYX89Kc/JRaLHfazdjqdZGVlHfY4ERE5eupxEhGRw/oocXI6nX16/kMPPcSWLVu6bV+2bBn79u3jhhtuICsri23btvHnP/+Zbdu28f7772MYRpfj//CHP+Dz+ToffzqRa2ho4LzzzuPyyy/nyiuv5LHHHuOrX/0qDoeDG2+8EYDm5mb+8pe/cOWVV/LlL3+ZlpYW/vrXv3L22WezevVqZs6c2eWcLpeL+++/n0suuaRz25NPPkkwGDzo+21paeH555/n0ksv7dx2//3343K5uj3vgQcewOfzcdttt+Hz+XjjjTf40Y9+RHNzM7/85S8P+hoiIjK4lDiJiMhhNTU1AeB2u3v93FAoxI9+9CPOPfdcXnrppS77/v3f/51vfetbXbbNnz+fK6+8knfffZeTTz65y77Pfe5zpKWlHfS1ysvLueuuu7jtttsA+MpXvsK8efP4/ve/zxe+8AXsdjvJyckUFxfjcDg6n/flL3+ZE044gd/85jf89a9/7XLOSy+9lMcff5yqqioyMzMB+Nvf/sZnPvMZHnnkkR7juPTSS/nb3/7WmTiVlpby+uuvc8UVV/DPf/6zy7GPPPJIl8/1pptu4qabbuL3v/89//3f/93nZFVERPqXhuqJiMhhfTR0Lj09vdfP/d3vfkddXR0//vGPu+37ZMIQDAapra1l/vz5AKxfv77Xr2Wz2fjKV77S+djhcPCVr3yF6upq1q1bB4DVau1MmuLxOPX19USjUU488cQeX3P27NlMmTKFhx56CICSkhLefPNNrr/++oPGceONN/Lyyy9TWVkJwIMPPsiCBQuYMGFCt2M/+Rm0tLRQW1vLySefTCAQ4IMPPuj1ZyAiIgNDiZOIiBxWSUkJNput14lTU1MT//M//8Ntt93W2VvzSfX19dxyyy1kZmbidrtJT0+noKCg87m9lZOTg9fr7bLto2Tlk3OuHnzwQaZPn47L5SI1NZX09HReeOGFg77mDTfcwP333w90DK1buHAh48ePP2gcM2fOZOrUqfz973/HNE0eeOABbrjhhh6P3bZtG5deeimJiYn4/X7S09O55pprgL59BiIiMjCUOImIyGHt3LmTsWPHdinGcCR+/vOfY7FY+Pa3v93j/ssvv5z77ruPm266iSeffJJXX32Vl19+GejoDRoIDz/8MNdffz2FhYX89a9/5eWXX2bZsmWcdtppB33Na665hj179vD+++/z4IMPHjQJ+qQbb7yR+++/n7fffpvKykouv/zybsc0NjZy6qmnsmnTJn7yk5/w3HPPsWzZMn7+858DA/cZiIhI72mOk4iIHFIoFGLjxo1diiMcifLycu69917uvPNOEhISulXKa2ho4PXXX+eOO+7gRz/6Uef23bt39znW8vJy2trauvQ67dq1C4AxY8YAHYv5jh07lieffLJL8YmehhJ+JDU1lYsuuqhz2N/ll19+2DLsV199Nd/+9re55ZZb+NznPkdCQkK3Y9566y3q6up48sknOeWUUzq3FxUVHdH7FRGRwaMeJxEROaRHHnmEUCjE6aef3qvn3XHHHWRmZnLTTTf1uN9qtQJgmmaX7ffcc0+f4gSIRqP86U9/6nwcDof505/+RHp6OnPmzDno665atYqVK1ce8tw33ngjmzdv5rLLLutS2e9gUlJSuPjii9m8eXNnRb9P6ymWcDjM73//+8OeX0REBpd6nEREpEdtbW385je/4Sc/+QlWqxXTNHn44Ye7HFNVVUVraysPP/wwZ555Zpd5TK+++ir/+Mc/ulSv+yS/388pp5zCL37xCyKRCKNGjeLVV189qt6WnJwcfv7zn1NcXMyECRN49NFH2bhxI3/+85+x2+0AXHDBBTz55JNceumlnH/++RQVFfHHP/6RyZMn09raetBzn3POOdTU1BxR0vSRBx54gN/97ncHrQS4cOFCkpOTue6667j55psxDIOHHnqoWzJ5KL/97W9pbGykvLwcgOeee44DBw4A8I1vfIPExMQjPpeIiBycEicREelRTU0N3//+9zsff7Ja3ad94Qtf4M033+ySOM2cOZMrr7zykK/xyCOP8I1vfIPf/e53mKbJWWedxUsvvUROTk6fYk5OTubBBx/kG9/4Bvfddx+ZmZn89re/5ctf/nLnMddffz2VlZX86U9/4pVXXmHy5Mk8/PDDPP7447z11lsHPbdhGIcshd4Tt9t9yBLuqampPP/883zrW9/iv/7rv0hOTuaaa67h9NNP5+yzzz6i1/jVr35FSUlJ5+Mnn3ySJ598EuiYm6XESUSkfxhmb25riYjIiFFcXExBQQFvvvkmS5YsOerjBtqSJUuora1l69atQxaDiIgMX5rjJCIiIiIichhKnEREpEc+n4+rr766x/WX+nKciIjI8UxD9UREZFjQUD0RERlISpxEREREREQOQ0P1REREREREDkOJk4iIiIiIyGGMuHWc4vE45eXlJCQkYBjGUIcjIiIiIiJDxDRNWlpayMnJwWI5dJ/SiEucysvLycvLG+owRERERETkGLF//35yc3MPecyIS5wSEhKAjg/H7/f3+/kjkQivvvoqZ511Fna7vd/PL4NPbTo8qV2HH7Xp8KR2HX7UpsPP8dymzc3N5OXldeYIhzKkidOdd97Jk08+yQcffIDb7WbhwoX8/Oc/Z+LEiQd9zgMPPMANN9zQZZvT6SQYDB7Ra340PM/v9w9Y4uTxePD7/cfdF0d6pjYdntSuw4/adHhSuw4/atPhZzi06ZFM4RnS4hBvv/02X/va13j//fdZtmwZkUiEs846i7a2tkM+z+/3U1FR0fmvpKRkkCIWEREREZGRaEh7nF5++eUujx944AEyMjJYt24dp5xyykGfZxgGWVlZAx2eiIiIiIgIcIzNcWpqagIgJSXlkMe1traSn59PPB5n9uzZ/M///A9Tpkzp8dhQKEQoFOp83NzcDHR0KUYikX6K/GMfnXMgzi1DQ206PKldhx+16fCkdh1+1KbDz/Hcpr2J2TBN0xzAWI5YPB7noosuorGxkXffffegx61cuZLdu3czffp0mpqa+NWvfsXy5cvZtm1bj5Uwbr/9du64445u2x955BE8Hk+/vgcRERERETl+BAIBrrrqKpqamg5b/+CYSZy++tWv8tJLL/Huu+8ethTgJ0UiESZNmsSVV17
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"\n",
"plt.scatter(df['Age'], df['DiabetesPedigreeFunction'], c=df['Age'], alpha=0.6)\n",
"\n",
"plt.title(\"Диаграмма 1\")\n",
"plt.ylabel(\"Функция родословной диабета\")\n",
"plt.xlabel(\"Возраст\")\n",
"plt.grid(visible='true')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 4,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-10-25 20:14:00 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+QAAAIjCAYAAACKx9GpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACwvElEQVR4nOzdd3hUdb4/8PeZnjrpPSGhC6EXKYrY27VhWyuKfdUtevfexd/evdvu4ra7VV3XAihi20VX3WsXUECk9w4J6QlpkzJ95vz+mDknk5Ayvb5fz8OjTKZ8CSGZz/k0QRRFEUREREREREQUVopIH4CIiIiIiIgoETEgJyIiIiIiIooABuREREREREREEcCAnIiIiIiIiCgCGJATERERERERRQADciIiIiIiIqIIYEBOREREREREFAEMyImIiIiIiIgigAE5ERERERERUQQwICciIiIiIiKKAAbkREREQfD3v/8dgiAM+quysjLSx0tY27dvx2OPPYbJkycjJSUFZWVluOWWW3Ds2LFIH42IiAiqSB+AiIgonjz11FM455xz5N//z//8TwRPQ7/61a+wefNm3HzzzZg6dSqamprwl7/8BTNnzsTWrVt5sYSIiCJKEEVRjPQhiIiIYt3f//533HzzzVi/fj0WL14s37548WK0trbiwIEDkTtcAtuyZQtmz54NjUYj33b8+HFMmTIFN910E9asWRPB0xERUaJjyToREVEQWK1WAIBCMfKP1lWrVkEQBFRXV8u3OZ1OTJ06FYIgYNWqVfLt+/btwz333IPRo0dDp9OhoKAAy5YtQ1tbW7/n/MlPfjJoubxK1VcMt3jxYlRWVmLnzp1YsGABkpKSUFFRgb/+9a9n/Vl+/OMfY9asWdDr9UhJScH555+P9evX97tfdXW1/Drvvvtuv4+ZzWZkZmZCEAT89re/PeuceXl5sNls/R7z+uuvy8/X2toq3/7Pf/4TV199NYqKiqDVajFmzBj8/Oc/h8PhGPFzvWDBgn7BOACMGzcOkydPxuHDh0d8PBERUSixZJ2IiCgIpIBcq9X69fhXX30V+/fvP+v2Tz/9FKdOncK9996LgoICHDx4EH/7299w8OBBbN26FYIg9Lv/c889h9TUVPn3Ay8QdHR04KqrrsItt9yC2267DW+99RYeeeQRaDQaLFu2DADQ1dWFF198EbfddhseeOABdHd346WXXsLll1+Obdu2Yfr06f2eU6fTYeXKlbj++uvl29atWwez2Tzkn7e7uxsffPABbrjhBvm2lStXQqfTnfW4VatWITU1FU888QRSU1PxxRdf4Mc//jG6urrwm9/8ZsjXGIooimhubsbkyZN9fiwREVEwMSAnIiIKAoPBAABISkry+bEWiwU//vGPceWVV+LDDz/s97Fvf/vbePLJJ/vdNm/ePNx2223YtGkTzj///H4fu+mmm5CTkzPkazU0NOB3v/sdnnjiCQDAQw89hHPPPRfLly/HXXfdBbVajczMTFRXV/fLLD/wwAOYOHEi/vznP+Oll17q95w33HAD3n77bTQ3NyM/Px8A8PLLL2PJkiVYu3btoOe44YYb8PLLL8sBeU1NDT7//HPceuuteP311/vdd+3atf0+rw8//DAefvhhPPvss/jFL37h80WQ1157DfX19fjZz37m0+OIiIiCjSXrREREQSCVkOfm5vr82GeeeQZtbW347//+77M+5hmIms1mtLa2Yt68eQCAXbt2+fxaKpUKDz30kPx7jUaDhx56CC0tLdi5cycAQKlUysG40+lEe3s77HY7Zs+ePehrzpw5E5MnT8arr74KADh9+jTWr1+Pe+65Z8hzLFu2DB999BGampoAAKtXr8b8+fMxfvz4s+7r+Tno7u5Ga2srzj//fBiNRhw5csSnP/+RI0fw6KOPYv78+Vi6dKlPjyUiIgo2BuRERERBcPr0aahUKp8DcoPBgF/+8pd44okn5Oyyp/b2dnz3u99Ffn4+kpKSkJubi4qKCvmxvioqKkJKSkq/26Qg2LOnffXq1Zg6dSp0Oh2ys7ORm5uLf/3rX0O+5r333ouVK1cCcJWYL1iwAOPGjRvyHNOnT0dlZSVeeeUViKKIVatW4d577x30vgcPHsQNN9wAvV6P9PR05Obm4s477wTg2+egqakJV199NfR6Pf7+979DqVR6/VgiIqJQYEBOREQUBEePHsXo0aP7DVHzxq9+9SsoFAr84Ac/GPTjt9xyC1544QU8/PDDWLduHT755BN89NFHAFzZ61BYs2YN7rnnHowZMwYvvfQSPvroI3z66ae46KKLhnzNO++8EydOnMDWrVuxevXqIYNrT8uWLcPKlSuxceNGNDU14ZZbbjnrPp2dnbjggguwd+9e/OxnP8P777+PTz/9FL/61a8AeP85MBgMuPLKK9HZ2YmPPvoIRUVFXj2OiIgolNhDTkREFCCLxYI9e/b0G2rmjYaGBvzxj3/EihUrkJaWdtbk9I6ODnz++ef46U9/ih//+Mfy7cePH/f7rA0NDejt7e2XJT927BgAoLy8HIBrhdvo0aOxbt26fkPjBiupl2RnZ+Paa6+Vy99vueWWfpPSB3PHHXfgBz/4Ab773e/ipptuQlpa2ln32bBhA9ra2rBu3TosWrRIvr2qqsqrPy/gKvW/5pprcOzYMXz22WeYNGmS148lIiIKJWbIiYiIArR27VpYLBZcfPHFPj3upz/9KfLz8/Hwww8P+nGppFoUxX63/+EPf/DrnABgt9vx/PPPy7+3Wq14/vnnkZubi1mzZg35ut988w2+/vrrYZ972bJl2LdvH26++eZ+k96HkpWVheuuuw779u2TJ7wPNNhZrFYrnn322RGfHwAcDgduvfVWfP3113j77bcxf/58rx5HREQUDsyQExER+am3txd//vOf8bOf/QxKpRKiKGLNmjX97tPc3Iyenh6sWbMGl156ab8+8U8++QSvvfbaWXuyJenp6Vi0aBF+/etfw2azobi4GJ988olP2eGBioqK8Ktf/QrV1dUYP3483nzzTezZswd/+9vfoFarAQD/9m//hnXr1uGGG27A1VdfjaqqKvz1r3/FpEmT0NPTM+RzX3HFFThz5oxXwbhk1apVeOaZZ4acDL9gwQJkZmZi6dKl+M53vgNBEPDqq6+edZFiKE8++STee+89XHPNNWhvbz/r70fqRSciIooEBuRERER+OnPmDJYvXy7/3nN6+UB33XUX1q9f3y8gnz59Om677bZhX2Pt2rV4/PHH8cwzz0AURVx22WX48MMP/e6BzszMxOrVq/H444/jhRdeQH5+Pv7yl7/ggQcekO9zzz33oKmpCc8//zw+/vhjTJo0CWvWrMHbb7+NDRs2DPncgiAMu3JtMElJScOuisvOzsYHH3yAJ598Ej/60Y+QmZmJO++8ExdffDEuv/zyEZ9/z549AID3338f77///lkfZ0BORESRJIjeXmImIiKifqqrq1FRUYH169dj8eLFAd8v1BYvXozW1lYcOHAgYmcgIiKiPuwhJyIiIiIiIooABuRERER+Sk1NxR133DHo/nB/7kdERESJhSXrRERECYIl60RERNGFATkRERERERFRBLBknYiIiIiIiCgCGJATERERERERRUDc7yF3Op1oaGhAWloaBEGI9HGIiIiIiIgozomiiO7ubhQVFUGhGDoPHvcBeUNDA0pLSyN9DCIiIiIiIkowtbW1KCkpGfLjcR+Qp6WlAXB9ItLT0yN8GiIiIiIiIop3XV1dKC0tlePRocR9QC6VqaenpzMgJyIiIiIiorAZqW2aQ92IiIiIiIiIIoABOREREREREVEEMCAnIiIiIiIiigAG5EREREREREQRwICciIiIiIiIKAIYkBMRERERERFFAANyIiIiIiIioghgQE5EREREREQUAQzIiYiIiIiIiCKAATkRERERERFRBDAgJyIiIiIiIooABuREREREREREEcCAnIiIiIiIiCgCGJATERERERERRQADciIiIiIiIqIIYEBOREQh0WgwYcvJVjQaTJE+ChEREVFUUkX6AEREFH/e3F6D5ev2wykCCgFYsWQ
2024-10-19 12:54:01 +04:00
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df_dependence = df.groupby('Age')['BMI'].mean().reset_index()\n",
"\n",
"plt.figure(figsize=(12, 6))\n",
"\n",
"plt.plot(df_dependence['Age'], df_dependence['BMI'], marker='.')\n",
"\n",
"plt.title(\"Диаграмма 2\")\n",
"plt.xlabel(\"Возраст\")\n",
"plt.ylabel(\"ИМТ\")\n",
"\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-10-25 20:14:00 +04:00
"Присутствует связь между атрибутами, возраст влияет на многие атрибуты в данном датасете.\n",
"Для примера на первом графике приведена связь между функцией родословной диабета и возрастом. На втором графике показана связь возраста и ИМТ\n",
2024-10-19 12:54:01 +04:00
"Примеры бизнес целей\n",
"\n",
2024-10-25 20:14:00 +04:00
" 1.Прогнозирование шанса диабета на основе функции и возраста.\n",
" 2.Улучшение диагностики диабета.\n",
2024-10-19 12:54:01 +04:00
" \n",
2024-10-25 20:14:00 +04:00
"Эффект для бизнеса: снижение затрат на лечение, улучшение качества обслуживания.\n",
2024-10-19 12:54:01 +04:00
"Цели технического проекта\n",
"\n",
"Для первой цели:\n",
"\n",
2024-10-25 20:14:00 +04:00
"Вход: возраст\n",
"Целевой признак: Наличие диабета.\n",
2024-10-19 12:54:01 +04:00
"\n",
"Для второй цели:\n",
"\n",
2024-10-25 20:14:00 +04:00
"Вход: глюкоза, давление\n",
"Целевой признак: Наличие диабета."
2024-10-19 12:54:01 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка на выбросы"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 5,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пропущенные значения по столбцам:\n",
"Pregnancies 0\n",
"Glucose 0\n",
"BloodPressure 0\n",
"SkinThickness 0\n",
"Insulin 0\n",
"BMI 0\n",
"DiabetesPedigreeFunction 0\n",
"Age 0\n",
"Outcome 0\n",
"dtype: int64\n",
"\n",
"Статистический обзор данных:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
"count 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
"mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n",
"std 3.369578 31.972618 19.355807 15.952218 115.244002 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n",
"50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n",
"75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n",
"max 17.000000 199.000000 122.000000 99.000000 846.000000 \n",
"\n",
" BMI DiabetesPedigreeFunction Age Outcome \n",
"count 768.000000 768.000000 768.000000 768.000000 \n",
"mean 31.992578 0.471876 33.240885 0.348958 \n",
"std 7.884160 0.331329 11.760232 0.476951 \n",
"min 0.000000 0.078000 21.000000 0.000000 \n",
"25% 27.300000 0.243750 24.000000 0.000000 \n",
"50% 32.000000 0.372500 29.000000 0.000000 \n",
"75% 36.600000 0.626250 41.000000 1.000000 \n",
"max 67.100000 2.420000 81.000000 1.000000 \n"
]
}
],
"source": [
"null_values = df.isnull().sum()\n",
"print(\"Пропущенные значения по столбцам:\")\n",
"print(null_values)\n",
"\n",
"stat_summary = df.describe()\n",
"print(\"\\nСтатистический обзор данных:\")\n",
"print(stat_summary)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"На основе данных выше можно выделить что нулевых данных нет\n",
"Также проверим данные на выбросы и дубликаты:"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 6,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Коэффициент асимметрии для столбца 'Pregnancies': 0.9016739791518588\n",
"\n",
"Коэффициент асимметрии для столбца 'Glucose': 0.17375350179188992\n",
"\n",
"Коэффициент асимметрии для столбца 'BloodPressure': -1.8436079833551302\n",
"\n",
"Коэффициент асимметрии для столбца 'SkinThickness': 0.10937249648187608\n",
"\n",
"Коэффициент асимметрии для столбца 'Insulin': 2.272250858431574\n",
"\n",
"Коэффициент асимметрии для столбца 'BMI': -0.42898158845356543\n",
"\n",
"Коэффициент асимметрии для столбца 'DiabetesPedigreeFunction': 1.919911066307204\n",
"\n",
"Коэффициент асимметрии для столбца 'Age': 1.1295967011444805\n",
"\n",
"Коэффициент асимметрии для столбца 'Outcome': 0.635016643444986\n",
"\n",
"Количество дубликатов: 0\n"
]
}
],
"source": [
"for column in df.select_dtypes(include=[np.number]).columns:\n",
" skewness = df[column].skew()\n",
" print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")\n",
"\n",
"duplicates = df.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"На основе данных выше можно сказать, что для столбцов выбросы незначительны. Дупликатов нет"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Очистка данных от шумов:"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 7,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-10-25 20:14:00 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACrd0lEQVR4nOzdeXwTZf4H8E/Su6UnR1MQSsGDS0HwoCCHigvqgq66rAgI6grLoiuKCuxPRLwQb9dFEA9QEO8TD1wEBGULrBwuyLFYS0FsQFraQksPmvn9USbmmmRmMpOZST7v14vXS5LJM9955kmchzzziU0QBAFEREREREQEALAbXQAREREREZGZcJJERERERETkgZMkIiIiIiIiD5wkEREREREReeAkiYiIiIiIyAMnSURERERERB44SSIiIiIiIvLASRIREREREZEHTpKIiIiIiIg8cJJEREREZKCff/4Zixcvdv993759eOONN4wriIg4SSKi8Lz33nuw2WwB//To0cPo8oiITM9ms2Hy5Mn48ssvsW/fPtx777345ptvjC6LKKbFG10AEUWHv//97+jatav774888oiB1RARWUe7du1w6623YtiwYQCAvLw8fP3118YWRRTjbIIgCEYXQUTW9d577+GPf/wj1qxZg8GDB7sfHzx4MI4cOYIdO3YYVxwRkYUUFxfjyJEj6NGjB9LS0owuhyimcbkdEYWloaEBAGC3h/44Wbx4MWw2G/bt2+d+zOVy4ZxzzoHNZvNak//f//4X48ePR6dOnZCcnAyHw4Gbb74Z5eXlXm0+8MADAZf6xcf/9kX54MGD0aNHD2zevBn9+vVDSkoKCgoKsGDBAr9juf/++9GnTx9kZmYiLS0NAwYMwJo1a7y227dvn3s/H330kddzdXV1yM7Ohs1mw5NPPulXZ5s2bdDY2Oj1mjfffNPd3pEjR9yPf/zxx7jyyivRtm1bJCUloXPnznjooYfQ1NQUsq/F/e3evRsjR45ERkYGWrZsiTvuuAN1dXVe2y5atAiXXHIJ2rRpg6SkJHTr1g3z588P2O4XX3yBQYMGIT09HRkZGTj//POxbNkyr202btyIK664AtnZ2UhLS8M555yD5557zmub3bt347rrrkNOTg6Sk5Nx3nnn4ZNPPvHaRsl4GT9+vNf5z87OxuDBg/2WLMntU3HM+HryySf9aurYsSPGjx/vtd27774Lm82Gjh07ej1++PBh3HLLLejQoQPi4uLc9bZo0cJvX746duwoubTVZrP5bb906VL06dMHKSkpyMnJwfXXX48DBw4EPM5Q7w0AqK+vx6xZs3D66acjKSkJ7du3x7333ov6+nq/bb/++mvZdfoSx26g4/fsZyXjA4D7vdC6dWukpKTgrLPOwv/93/957TPYH/GbncGDB3v9gxDQ/M253W73ey+8++677nPQqlUrjBkzBgcPHvTaZvz48e5x0rlzZ1x44YWoqKhASkqK3/ERUeRwuR0RhUWcJCUlJal6/ZIlS7B9+3a/x1euXImffvoJN910ExwOB3744QcsXLgQP/zwAzZs2OB3ETV//nyvC03fSdvRo0dxxRVXYOTIkRg1ahTeeecdTJo0CYmJibj55psBANXV1Xj55ZcxatQo3HrrrTh27BheeeUVDB06FJs2bUKvXr282kxOTsaiRYtw9dVXux/74IMP/CYhno4dO4ZPP/0Uf/jDH9yPLVq0CMnJyX6vW7x4MVq0aIG77roLLVq0wOrVq3H//fejuroaTzzxhOQ+PI0cORIdO3bEnDlzsGHDBvzjH//A0aNH8frrr3v1Xffu3TFixAjEx8dj+fLl+Otf/wqXy4XJkyd71XPzzTeje/fumDFjBrKysrB161asWLECN9xwA4Dm8/b73/8eeXl5uOOOO+BwOLBr1y58+umnuOOOOwAAP/zwA/r374927dph+vTpSEtLwzvvvIOrr74a77//vlff+JIaLwDQqlUrPPPMMwCab4R/7rnncMUVV+DAgQPIysrSrE9DOXnypPvi29e4cePw1Vdf4fbbb0fPnj0RFxeHhQsXYsuWLbLa7tWrF6ZOner12Ouvv46VK1d6PfbII49g5syZGDlyJP785z/j119/xfPPP4+BAwdi69at7v4A5L03XC4XRowYgW+//RYTJkxA165dsX37djzzzDP43//+5/ePBaK//e1vOP/88yXr1JrU+Pjvf/+LAQMGICEhARMmTEDHjh1RXFyM5cuX45FHHsE111yD008/3b39nXfeia5du2LChAnuxzyXE3tatGgR7rvvPjz11FPu9wHQPNZuuukmnH/++ZgzZw4OHTqE5557DuvXr/c7B77uv//+oJ8jRBQBAhFRGJ599lkBgPD99997PT5o0CChe/fuXo8tWrRIACCUlJQIgiAIdXV1QocOHYTLL79cACAsWrTIvW1tba3fvt58800BgLBu3Tr3Y7NmzRIACL/++qtkjYMGDRIACE899ZT7sfr6eqFXr15CmzZthIaGBkEQBOHkyZNCfX2912uPHj0q5ObmCjfffLP7sZKSEgGAMGrUKCE+Pl5wOp3u5y699FLhhhtuEAAITzzxhF+do0aNEn7/+9+7Hy8tLRXsdrswatQov+MI1AcTJ04UUlNThbq6Osnj9dzfiBEjvB7/61//6ne+Au1n6NChQqdOndx/r6ysFNLT04ULL7xQOHHihNe2LpdLEITm/isoKBDy8/OFo0ePBtxGEJr76Oyzz/Y6BpfLJfTr108444wz3I8pGS/jxo0T8vPzvfa5cOFCAYCwadOmoMcaqE8DjV9BEIQnnnjCqyZBEIT8/Hxh3Lhx7r+/8MILQlJSknDxxRd71XTixAnBbrcLEydO9Gpz3LhxQlpamt++fOXn5wtXXnml3+OTJ08WPP93vm/fPiEuLk545JFHvLbbvn27EB8f7/W43PfGkiVLBLvdLnzzzTdebS5YsEAAIKxfv97r8X/9618CAOG9996TrFPK7NmzBQBeY0Y8fs9+VjI+Bg4cKKSnpwulpaVebfruQ2pfngYNGiQMGjRIEARB+Oyzz4T4+Hhh6tSpXts0NDQIbdq0EXr06OH1fvn0008FAML999/vfsx37O7YsUOw2+3u4/Aca0QUOVxuR0RhEZe/tW7dWvFr582bh/LycsyaNcvvuZSUFPd/19XV4ciRI+jbty8AyP5Xd0/x8fGYOHGi+++JiYmYOHEiDh8+jM2bNwMA4uLikJiYCKD5X84rKipw8uRJnHfeeQH32bt3b3Tv3h1LliwBAJSWlmLNmjV+S6883XzzzVixYgWcTicA4LXXXkNhYSHOPPNMv209++DYsWM4cuQIBgwYgNraWuzevVvWcXt+EwQAt99+OwDg888/D7ifqqoqHDlyBIMGDcJPP/2EqqoqAM3fEB07dgzTp09HcnKyV5vit3pbt25FSUkJpkyZ4vev5OI2FRUVWL16NUaOHOk+piNHjqC8vBxDhw7F3r17/ZYjiYKNF6D5nIntbdu2Da+//jry8vK8vgFQ0qdNTU3u9sQ/tbW1Afctqq2txYMPPojbbrsNHTp08HqupqYGLpcLLVu2DNpGuD744AO4XC6MHDnSq3aHw4EzzjjDb/monPfGu+++i65du6JLly5ebV5yySUA4Nem+C2I71iRo02bNgCavw1UQmp8/Prrr1i3bh1uvvlmv3MiZ/mflE2bNmHkyJG49tpr/b6F/O6773D48GH89a9/9eqDK6+8El26dMFnn30m2e6MGTPQu3dv/PGPf1RdGxGFj8vtiCgspaWliI+PVzxJqqqqwqOPPoq77roLubm5fs9XVFRg9uzZeOutt3D48GG/1yrVtm1bvxuhxYnJvn373BOw1157DU899RR2797tde9QQUFBwHZvuukmLFy4EHfffTcWL16Mfv364YwzzpCso1evXujRowdef/113HPPPVi8eDH+/ve/+90rAjQvS7vvvvuwevVqVFdXez0ntw98a+ncuTPsdrvXfQ7r16/HrFmzUFRU5DcJqKqqQmZmJoqLiwEgaKy7nG1+/PFHCIKAmTNnYubMmQG3OXz4MNq
2024-10-19 12:54:01 +04:00
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы в датасете:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
2024-10-25 20:14:00 +04:00
"4 0 137 40 35 168 43.1 \n",
"12 10 139 80 0 0 27.1 \n",
"39 4 111 72 47 207 37.1 \n",
"45 0 180 66 39 0 42.0 \n",
"58 0 146 82 0 0 40.5 \n",
"100 1 163 72 0 0 39.0 \n",
"147 2 106 64 35 119 30.5 \n",
"187 1 128 98 41 58 32.0 \n",
"218 5 85 74 22 0 29.0 \n",
"228 4 197 70 39 744 36.7 \n",
"243 6 119 50 22 176 27.1 \n",
"245 9 184 85 15 0 30.0 \n",
"259 11 155 76 28 150 33.3 \n",
"292 2 128 78 37 182 43.3 \n",
"308 0 128 68 19 180 30.5 \n",
"330 8 118 72 19 0 23.1 \n",
"370 3 173 82 48 465 38.4 \n",
"371 0 118 64 23 89 0.0 \n",
"383 1 90 62 18 59 25.1 \n",
"395 2 127 58 24 275 27.7 \n",
"445 0 180 78 63 14 59.4 \n",
"534 1 77 56 30 56 33.3 \n",
"593 2 82 52 22 115 28.5 \n",
"606 1 181 78 42 293 40.0 \n",
"618 9 112 82 24 0 28.2 \n",
"621 2 92 76 20 0 24.2 \n",
"622 6 183 94 0 0 40.8 \n",
"659 3 80 82 31 70 34.2 \n",
"661 1 199 76 43 0 42.9 \n",
2024-10-19 12:54:01 +04:00
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
2024-10-25 20:14:00 +04:00
"4 2.288 33 1 \n",
"12 1.441 57 0 \n",
"39 1.390 56 1 \n",
"45 1.893 25 1 \n",
"58 1.781 44 0 \n",
"100 1.222 33 1 \n",
"147 1.400 34 0 \n",
"187 1.321 33 1 \n",
"218 1.224 32 1 \n",
"228 2.329 31 0 \n",
"243 1.318 33 1 \n",
"245 1.213 49 1 \n",
"259 1.353 51 1 \n",
"292 1.224 31 1 \n",
"308 1.391 25 1 \n",
"330 1.476 46 0 \n",
"370 2.137 25 1 \n",
"371 1.731 21 0 \n",
"383 1.268 25 0 \n",
"395 1.600 25 0 \n",
"445 2.420 25 1 \n",
"534 1.251 24 0 \n",
"593 1.699 25 0 \n",
"606 1.258 22 1 \n",
"618 1.282 50 1 \n",
"621 1.698 28 0 \n",
"622 1.461 45 0 \n",
"659 1.292 27 1 \n",
"661 1.394 22 1 \n"
2024-10-19 12:54:01 +04:00
]
},
{
"data": {
2024-10-25 20:14:00 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC5fUlEQVR4nOzdeXwU5eEG8Gc3dwK5OJKgGAKegILgQUBAKxbUQr1K64k3UtoqntAWEa3Gs+qvRRRUsCLiUS8spSqgeAS0IFYEFGNA1AQkIQkk5CA7vz/CrrubmZ3rnWv3+X4+fJQ93nmvmd2XnXnGJ0mSBCIiIiIiIgIA+J2uABERERERkZtwkURERERERBSGiyQiIiIiIqIwXCQRERERERGF4SKJiIiIiIgoDBdJREREREREYbhIIiIiIiIiCsNFEhERERERURgukoiIiIiIiMJwkURERERERBSGiyQissTLL78Mn88n+2fgwIFOV4+IiIhIUbLTFSCi+PbHP/4RxxxzTOjvd999t4O1ISIiIlLHRRIRWeqMM87AqaeeGvr7k08+id27dztXISIiIiIVPN2OiCzR2toKAPD71Q8zCxcuhM/nw7Zt20KPBQIBHHfccfD5fFi4cGHo8f/973+4/PLL0bdvX6Snp6OwsBBXXnklampqIsq84447ZE/1S07+6d+GTj31VAwcOBDr1q3D8OHDkZGRgZKSEjz++OOd2nL77bdj6NChyMnJQVZWFkaOHIlVq1ZFvG7btm2h7bz22msRzzU3NyMvLw8+nw8PPvhgp3r27NkTbW1tEe95/vnnQ+WFLyxff/11nH322ejVqxfS0tLQr18/3HXXXWhvb1ft6+D2tmzZgokTJyI7OxvdunXD9ddfj+bm5ojXLliwAD/72c/Qs2dPpKWloX///pg7d65suf/+978xevRodO3aFdnZ2TjxxBOxePHiiNesXbsWZ511FvLy8pCVlYXjjjsOjz76aMRrtmzZggsuuAD5+flIT0/HCSecgDfeeCPiNXrmy+WXXx4x/nl5eTj11FPx/vvvR5SptU+Dcybagw8+2KlOffr0weWXXx7xupdeegk+nw99+vSJeHzXrl246qqrcNhhhyEpKSlU3y5dunTaVrQ+ffoontrq8/kiXnvgwAHcdddd6NevH9LS0tCnTx/88Y9/REtLS6dytYxp+JyPtd1AIIBHHnkEAwYMQHp6OgoKCjB58mTs2bNHU/ui+/Hdd9+Fz+fDu+++G3rs1FNPjfgHGQD45JNPZOsDAIsWLcJJJ52EzMxM5OXlYdSoUXjrrbdC24zVp8HxC7Y/fM7t3bsXQ4cORUlJCaqqqhRfBwBTp06Fz+fr1D4ich5/SSIiSwQXSWlpaYbe/+yzz+Lzzz/v9Pjbb7+Nb775BldccQUKCwvxxRdfYN68efjiiy+wZs2aTl+G5s6dG/FFM3rRtmfPHpx11lmYOHEiLrzwQrz44ouYMmUKUlNTceWVVwIAGhoa8OSTT+LCCy/ENddcg7179+Kpp57C2LFj8fHHH2Pw4MERZaanp2PBggU455xzQo+98sornRYh4fbu3Ys333wT5557buixBQsWID09vdP7Fi5ciC5duuDGG29Ely5dsHLlStx+++1oaGjAAw88oLiNcBMnTkSfPn1QVlaGNWvW4P/+7/+wZ88e/OMf/4jouwEDBmDChAlITk7G0qVL8dvf/haBQABTp06NqM+VV16JAQMGYMaMGcjNzcWnn36K5cuX46KLLgLQMW6/+MUvUFRUhOuvvx6FhYXYvHkz3nzzTVx//fUAgC+++AIjRozAIYccgunTpyMrKwsvvvgizjnnHPzzn/+M6JtoSvMFALp3746HH34YAPDdd9/h0UcfxVlnnYUdO3YgNzdXWJ+qOXDgAP70pz/JPjdp0iS88847+P3vf49BgwYhKSkJ8+bNw/r16zWVPXjwYNx0000Rj/3jH//A22+/HfHY1VdfjWeeeQYXXHABbrrpJqxduxZlZWXYvHkzXn311dDrtIxpuGuvvRYjR44E0DHXw8sCgMmTJ2PhwoW44oor8Ic//AGVlZX4+9//jk8//RQffvghUlJSNLVTr9tuu0328dmzZ+OOO+7A8OHDceeddyI1NRVr167FypUr8fOf/xyPPPII9u3bBwDYvHkz7rnnnohTh5UWr21tbTj//PPx7bff4sMPP0RRUZFi3b7++mvMnz/fZAuJyDISEZEFHnnkEQmA9Nlnn0U8Pnr0aGnAgAERjy1YsEACIFVWVkqSJEnNzc3SYYcdJp155pkSAGnBggWh1zY1NXXa1vPPPy8BkFavXh16bNasWRIA6ccff1Ss4+jRoyUA0kMPPRR6rKWlRRo8eLDUs2dPqbW1VZIkSTpw4IDU0tIS8d49e/ZIBQUF0pVXXhl6rLKyUgIgXXjhhVJycrJUXV0deu7000+XLrroIgmA9MADD3Sq54UXXij94he/CD2+fft2ye/3SxdeeGGndsj1weTJk6XMzEypublZsb3h25swYULE47/97W87jZfcdsaOHSv17ds39Pe6ujqpa9eu0sknnyzt378/4rWBQECSpI7+KykpkYqLi6U9e/bIvkaSOvro2GOPjWhDIBCQhg8fLh1xxBGhx/TMl0mTJknFxcUR25w3b54EQPr4449jtlWuT+XmryRJ0gMPPBBRJ0mSpOLiYmnSpEmhvz/22GNSWlqadNppp0XUaf/+/ZLf75cmT54cUeakSZOkrKysTtuKVlxcLJ199tmdHp86daoU/jG/YcMGCYB09dVXR7zu5ptvlgBIK1eulCRJ25gGbd26VQIgPfPMM6HHgnMs6P3335cASM8991zEe5cvXy77eLSSkhLpsssui3hs1apVEgBp1apVocdGjx4tjR49OvT3ZcuWSQCkcePGRdRn69atkt/vl84991ypvb09ZvuUthUU3OcXLFggBQIB6eKLL5YyMzOltWvXKr4uaOLEidLAgQOl3r17R8wTInIHnm5HRJYInv7Wo0cP3e+dM2cOampqMGvWrE7PZWRkhP6/ubkZu3fvxrBhwwBA87+6h0tOTsbkyZNDf09NTcXkyZOxa9curFu3DgCQlJSE1NRUAB2nDdXW1uLAgQM44YQTZLc5ZMgQDBgwAM8++ywAYPv27Vi1alXMU2quvPJKLF++HNXV1QCAZ555BqWlpTjyyCM7vTa8D/bu3Yvdu3dj5MiRaGpqwpYtWzS1O/yXIAD4/e9/DwBYtmyZ7Hbq6+uxe/dujB49Gt988w3q6+sBdPxCtHfvXkyfPh3p6ekRZQZ/1fv0009RWVmJG264IfTLTfRramtrsXLlSkycODHUpt27d6OmpgZjx47F1q1b8f3338u2JdZ8ATrGLFjehg0b8I9//ANFRUURgSJ6+rS9vT1UXvBPU1OT7LaDmpqacOedd+J3v/sdDjvssIjnGhsbEQgE0K1bt5hlmBUc2xtvvDHi8eAvUP/6178AaBvTIC2/GL/00kvIycnBGWecEdFnQ4cORZcuXTqdthqtZ8+e+O677zS08CeSJGHGjBk4//zzcfLJJ0c899prryEQCOD222/v9Muy3Gl5Wt1yyy147rnn8OKLL+Kkk06K+dp169bhpZdeQllZmaZTkonIftwzicgS27dvR3Jysu5FUn19Pe655x7ceOONKCgo6PR8bW0trr/+ehQUFCAjIwM9evRASUlJ6L169erVC1lZWRGPBRcm4deXPPPMMzjuuOOQnp6Obt26oUePHvjXv/6luM0rrrgCCxYsANBx6tLw4cNxxBFHKNZj8ODBGDhwIP7xj39AkqTQqUlyvvjiC5x77rnIyclBdnY2evTogUsuuQSA9j6Irku/fv3g9/sj2vzhhx9izJgxyMrKQm5uLnr06IE//vGPEdupqKgAgJix7lpe8/XXX0OSJMycORM9evSI+BNc/OzatavT+9TmCwDs2LEjVNbxxx+PiooK/POf/4w4ZUpPn27ZskWxjkr++te/orm5OdR/4bp164YjjjgCTz75JN566y3s2rULu3fvlr1OyIzt27fD7/fj8MMPj3i8sLAQubm52L5
2024-10-19 12:54:01 +04:00
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df['DiabetesPedigreeFunction'], df['Age'])\n",
"plt.xlabel('Функция родословной диабета')\n",
"plt.ylabel('Возраст')\n",
"plt.title('Диаграмма рассеивания перед чисткой')\n",
"plt.show()\n",
"\n",
"Q1 = df[\"DiabetesPedigreeFunction\"].quantile(0.25)\n",
"Q3 = df[\"DiabetesPedigreeFunction\"].quantile(0.75)\n",
"\n",
"IQR = Q3 - Q1\n",
"\n",
"threshold = 1.5 * IQR\n",
"lower_bound = Q1 - threshold\n",
"upper_bound = Q3 + threshold\n",
"\n",
"outliers = (df[\"DiabetesPedigreeFunction\"] < lower_bound) | (df[\"DiabetesPedigreeFunction\"] > upper_bound)\n",
"\n",
"# Вывод выбросов\n",
"print(\"Выбросы в датасете:\")\n",
"print(df[outliers])\n",
"\n",
"# Заменяем выбросы на медианные значения\n",
"median_score = df[\"DiabetesPedigreeFunction\"].median()\n",
"df.loc[outliers, \"DiabetesPedigreeFunction\"] = median_score\n",
"\n",
"# Визуализация данных после обработки\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df['DiabetesPedigreeFunction'], df['Age'])\n",
"plt.xlabel('Функция родословной диабета')\n",
"plt.ylabel('Возраст')\n",
"plt.title('Диаграмма рассеивания после чистки')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 8,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 460\n",
"Размер контрольной выборки: 154\n",
"Размер тестовой выборки: 154\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n",
"\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 9,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение функции родословной диабета в обучающей выборке:\n",
"DiabetesPedigreeFunction\n",
2024-10-25 20:14:00 +04:00
"0.3725 12\n",
"0.2580 5\n",
"0.2990 4\n",
"0.1970 4\n",
"0.3040 4\n",
" ..\n",
"0.6610 1\n",
"0.3980 1\n",
"0.2860 1\n",
"0.6950 1\n",
"0.3050 1\n",
"Name: count, Length: 341, dtype: int64\n",
2024-10-19 12:54:01 +04:00
"\n",
"Распределение функции родословной диабета в контрольной выборке:\n",
"DiabetesPedigreeFunction\n",
2024-10-25 20:14:00 +04:00
"0.3725 10\n",
"0.1420 3\n",
"0.2540 3\n",
"0.4440 2\n",
"0.3490 2\n",
" ..\n",
"0.2450 1\n",
"0.6930 1\n",
"0.1210 1\n",
"0.6860 1\n",
"0.1220 1\n",
"Name: count, Length: 132, dtype: int64\n",
2024-10-19 12:54:01 +04:00
"\n",
"Распределение функции родословной диабета в тестовой выборке:\n",
"DiabetesPedigreeFunction\n",
"0.3725 7\n",
2024-10-25 20:14:00 +04:00
"0.5280 2\n",
2024-10-19 12:54:01 +04:00
"0.1480 2\n",
2024-10-25 20:14:00 +04:00
"0.2590 2\n",
2024-10-19 12:54:01 +04:00
"0.4430 2\n",
" ..\n",
"0.2480 1\n",
"0.2360 1\n",
"0.3020 1\n",
"0.4850 1\n",
"0.7050 1\n",
2024-10-25 20:14:00 +04:00
"Name: count, Length: 135, dtype: int64\n",
2024-10-19 12:54:01 +04:00
"\n"
]
}
],
"source": [
"def check_balance(df, name):\n",
" counts = df['DiabetesPedigreeFunction'].value_counts()\n",
" print(f\"Распределение функции родословной диабета в {name}:\")\n",
" print(counts)\n",
" print()\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-10-25 20:14:00 +04:00
"В датасете нет категориальных данных, hot-encoding не требуется\n",
"\n",
"Используем дискретизацию"
2024-10-19 12:54:01 +04:00
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 10,
2024-10-19 12:54:01 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-10-25 20:14:00 +04:00
"Обучающая выборка:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"335 0 165 76 43 255 47.9 \n",
"467 0 97 64 36 100 36.8 \n",
"51 1 101 50 15 36 24.2 \n",
"131 9 122 56 0 0 33.3 \n",
"649 0 107 60 25 0 26.4 \n",
2024-10-19 12:54:01 +04:00
"\n",
2024-10-25 20:14:00 +04:00
" DiabetesPedigreeFunction Age Outcome Age_Category \n",
"335 0.259 26 0 young \n",
"467 0.600 25 0 young \n",
"51 0.526 26 0 young \n",
"131 1.114 33 1 young \n",
"649 0.133 23 0 young \n",
2024-10-19 12:54:01 +04:00
"\n",
2024-10-25 20:14:00 +04:00
"Контрольная выборка:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"370 3 173 82 48 465 38.4 \n",
"53 8 176 90 34 300 33.7 \n",
"644 3 103 72 30 152 27.6 \n",
"71 5 139 64 35 140 28.6 \n",
"675 6 195 70 0 0 30.9 \n",
2024-10-19 12:54:01 +04:00
"\n",
2024-10-25 20:14:00 +04:00
" DiabetesPedigreeFunction Age Outcome Age_Category \n",
"370 0.3725 25 1 young \n",
"53 0.4670 58 1 middle-aged \n",
"644 0.7300 27 0 young \n",
"71 0.4110 26 0 young \n",
"675 0.3280 31 1 young \n",
2024-10-19 12:54:01 +04:00
"\n",
2024-10-25 20:14:00 +04:00
"Тестовая выборка:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"668 6 98 58 33 190 34.0 \n",
"324 2 112 75 32 0 35.7 \n",
"624 2 108 64 0 0 30.8 \n",
"690 8 107 80 0 0 24.6 \n",
"473 7 136 90 0 0 29.9 \n",
2024-10-19 12:54:01 +04:00
"\n",
2024-10-25 20:14:00 +04:00
" DiabetesPedigreeFunction Age Outcome Age_Category \n",
"668 0.430 43 0 middle-aged \n",
"324 0.148 21 0 young \n",
"624 0.158 21 0 young \n",
"690 0.856 34 0 middle-aged \n",
"473 0.210 50 0 middle-aged \n"
]
}
],
"source": [
"list_age = [\"young\", \"middle-aged\", \"old\"]\n",
"bins = np.linspace(0, 100, 4)\n",
"train_df['Age_Category'] = pd.cut(train_df['Age'], bins=bins, labels=list_age)\n",
"val_df['Age_Category'] = pd.cut(val_df['Age'], bins=bins, labels=list_age)\n",
"test_df['Age_Category'] = pd.cut(test_df['Age'], bins=bins, labels=list_age)\n",
"print(\"Обучающая выборка:\\n\",train_df.head(), \"\\n\")\n",
"print(\"Контрольная выборка:\\n\",val_df.head(), \"\\n\")\n",
"print(\"Тестовая выборка:\\n\",test_df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Используем Ручной синтез, создадим новый столбец который будет говорить об опасности диабета при уровне сахара >=11"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 11,
2024-10-25 20:14:00 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"335 0 165 76 43 255 47.9 \n",
"467 0 97 64 36 100 36.8 \n",
"51 1 101 50 15 36 24.2 \n",
"131 9 122 56 0 0 33.3 \n",
"649 0 107 60 25 0 26.4 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome Age_Category Glucose_Warning \n",
"335 0.259 26 0 young 0 \n",
"467 0.600 25 0 young 0 \n",
"51 0.526 26 0 young 0 \n",
"131 1.114 33 1 young 0 \n",
"649 0.133 23 0 young 0 \n",
"\n",
"Контрольная выборка:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"370 3 173 82 48 465 38.4 \n",
"53 8 176 90 34 300 33.7 \n",
"644 3 103 72 30 152 27.6 \n",
"71 5 139 64 35 140 28.6 \n",
"675 6 195 70 0 0 30.9 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome Age_Category Glucose_Warning \n",
"370 0.3725 25 1 young 0 \n",
"53 0.4670 58 1 middle-aged 0 \n",
"644 0.7300 27 0 young 0 \n",
"71 0.4110 26 0 young 0 \n",
"675 0.3280 31 1 young 0 \n",
"\n",
"Тестовая выборка:\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"668 6 98 58 33 190 34.0 \n",
"324 2 112 75 32 0 35.7 \n",
"624 2 108 64 0 0 30.8 \n",
"690 8 107 80 0 0 24.6 \n",
"473 7 136 90 0 0 29.9 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome Age_Category Glucose_Warning \n",
"668 0.430 43 0 middle-aged 0 \n",
"324 0.148 21 0 young 0 \n",
"624 0.158 21 0 young 0 \n",
"690 0.856 34 0 middle-aged 0 \n",
"473 0.210 50 0 middle-aged 0 \n"
]
}
],
"source": [
"train_df['Glucose_Warning'] = [1 if i >= 200 else 0 for i in train_df[\"Glucose\"]]\n",
"val_df['Glucose_Warning'] = [1 if i >= 200 else 0 for i in val_df[\"Glucose\"]]\n",
"test_df['Glucose_Warning'] =[1 if i >= 200 else 0 for i in test_df[\"Glucose\"]]\n",
"print(\"Обучающая выборка:\\n\",train_df.head(), \"\\n\")\n",
"print(\"Контрольная выборка:\\n\",val_df.head(), \"\\n\")\n",
"print(\"Тестовая выборка:\\n\",test_df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Используем масштабирование"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 12,
2024-10-25 20:14:00 +04:00
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"\n",
"# Пример масштабирования числовых признаков\n",
"numerical_features = ['Glucose', 'BloodPressure', 'DiabetesPedigreeFunction', 'Age']\n",
"\n",
"scaler = StandardScaler()\n",
"train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])\n",
"val_df[numerical_features] = scaler.transform(val_df[numerical_features])\n",
"test_df[numerical_features] = scaler.transform(test_df[numerical_features])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Используем Featuretools"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 13,
2024-10-25 20:14:00 +04:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\dfs.py:321: UnusedPrimitiveWarning: Some specified primitives were not used during DFS:\n",
" agg_primitives: ['count', 'mean', 'sum']\n",
"This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.\n",
" warnings.warn(warning_msg, UnusedPrimitiveWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Built 9 features\n",
"Elapsed: 00:00 | Progress: 100%|██████████\n",
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"id \n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"id \n",
"0 0.6270 50 1 \n",
"1 0.3510 31 0 \n",
"2 0.6720 32 1 \n",
"3 0.1670 21 0 \n",
"4 0.3725 33 1 \n"
]
}
],
"source": [
"import featuretools as ft\n",
"df['id'] = df.index\n",
"es = ft.EntitySet(id='diabet_data')\n",
"es = es.add_dataframe(dataframe_name='diabet', dataframe=df, index='id', make_index=False)\n",
"features, feature_defs = ft.dfs(entityset=es,\n",
" target_dataframe_name='diabet',\n",
" agg_primitives=['count', 'mean', 'sum'],\n",
" verbose=True)\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_df.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_df.index)\n",
"print(features.head())\n"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 14,
2024-10-25 20:14:00 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-10-25 20:15:35 +04:00
"Время обучения модели: 0.00 секунд\n",
2024-10-25 20:14:00 +04:00
"Среднеквадратичная ошибка: 0.17\n"
]
}
],
"source": [
"import time\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"# Разделение данных на обучающую и валидационную выборки. Удаляем целевую переменную\n",
"X = features.drop('Outcome', axis=1)\n",
"y = features['Outcome']\n",
"\n",
"# One-hot encoding для категориальных переменных (преобразование категориальных объектов в числовые)\n",
"X = pd.get_dummies(X, drop_first=True)\n",
"\n",
"# Проверяем, есть ли пропущенные значения, и заполняем их медианой или другим подходящим значением\n",
"X.fillna(X.median(), inplace=True)\n",
"\n",
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Обучение модели\n",
"model = LinearRegression()\n",
"\n",
"# Начинаем отсчет времени\n",
"start_time = time.time()\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Время обучения модели\n",
"train_time = time.time() - start_time\n",
"\n",
"# Предсказания и оценка модели и вычисляем среднеквадратичную ошибку\n",
"predictions = model.predict(X_val)\n",
"mse = mean_squared_error(y_val, predictions)\n",
"\n",
"print(f'Время обучения модели: {train_time:.2f} секунд')\n",
"print(f'Среднеквадратичная ошибка: {mse:.2f}')"
]
},
{
"cell_type": "code",
2024-10-25 20:15:35 +04:00
"execution_count": 15,
2024-10-25 20:14:00 +04:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RMSE: 0.41022087810247887\n",
"R²: 0.26704202020201995\n",
"MAE: 0.3137012987012987 \n",
"\n",
"Кросс-валидация RMSE: 0.40125444539346944 \n",
"\n",
"Train RMSE: 0.14832451876358646\n",
"Train R²: 0.9028956552281269\n",
"Train MAE: 0.11144951140065147\n",
2024-10-19 12:54:01 +04:00
"\n"
]
2024-10-25 20:14:00 +04:00
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
2024-10-19 12:54:01 +04:00
}
],
"source": [
2024-10-25 20:14:00 +04:00
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.metrics import r2_score, mean_absolute_error\n",
"from sklearn.model_selection import cross_val_score\n",
2024-10-19 12:54:01 +04:00
"\n",
"\n",
2024-10-25 20:14:00 +04:00
"# Удаление строк с NaN\n",
"feature_matrix = features.dropna()\n",
"val_feature_matrix = val_feature_matrix.dropna()\n",
"test_feature_matrix = test_feature_matrix.dropna()\n",
2024-10-19 12:54:01 +04:00
"\n",
2024-10-25 20:14:00 +04:00
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train = feature_matrix.drop('Outcome', axis=1)\n",
"y_train = feature_matrix['Outcome']\n",
"X_val = val_feature_matrix.drop('Outcome', axis=1)\n",
"y_val = val_feature_matrix['Outcome']\n",
"X_test = test_feature_matrix.drop('Outcome', axis=1)\n",
"y_test = test_feature_matrix['Outcome']\n",
2024-10-19 12:54:01 +04:00
"\n",
2024-10-25 20:14:00 +04:00
"X_test = X_test.reindex(columns=X_train.columns, fill_value=0) \n",
"\n",
"# Кодирования категориальных переменных с использованием одноразового кодирования\n",
"X = pd.get_dummies(X, drop_first=True)\n",
"\n",
"# Разобьём тренировочный тест и примерку модели\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Выбор модели\n",
"model = RandomForestRegressor(random_state=42)\n",
"\n",
"# Обучение модели\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Предсказание и оценка\n",
"y_pred = model.predict(X_test)\n",
"\n",
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
"r2 = r2_score(y_test, y_pred)\n",
"mae = mean_absolute_error(y_test, y_pred)\n",
"\n",
"print()\n",
"print(f\"RMSE: {rmse}\")\n",
"print(f\"R²: {r2}\")\n",
"print(f\"MAE: {mae} \\n\")\n",
"\n",
"# Кросс-валидация\n",
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
"rmse_cv = (-scores.mean())**0.5\n",
"print(f\"Кросс-валидация RMSE: {rmse_cv} \\n\")\n",
"\n",
"# Анализ важности признаков\n",
"feature_importances = model.feature_importances_\n",
"feature_names = X_train.columns\n",
"\n",
"# Проверка на переобучение\n",
"y_train_pred = model.predict(X_train)\n",
"\n",
"rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)\n",
"r2_train = r2_score(y_train, y_train_pred)\n",
"mae_train = mean_absolute_error(y_train, y_train_pred)\n",
"\n",
"print(f\"Train RMSE: {rmse_train}\")\n",
"print(f\"Train R²: {r2_train}\")\n",
"print(f\"Train MAE: {mae_train}\")\n",
"print()"
2024-10-19 12:54:01 +04:00
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}