379 lines
926 KiB
Plaintext
Raw Normal View History

2024-12-11 15:35:02 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Бизнес-цель\n",
"Анализ ключевых факторов, влияющих на диабет. Предсказание вероятности развития диабета на основе медицинских данных. Актуальность для планирования лечения.\n",
"1. Уровень давления (BloodPressure) и возраст (Age) - с возрастом артериальное давление может повышаться, что является фактором риска для диабета.\n",
"2. Уровень инсулина (Insulin) и уровень глюкозы (Glucose) - уровень инсулина напрямую влияет на уровень сахара в крови.\n",
"3. Индекс массы тела (BMI) и возраст (Age) - с повышением возраста зачастую увеличивается индекс массы тела.\n",
"4. Уровень глюкозы (Glucose) и индекс массы тела (BMI) - как индекс массы тела влияет на уровень глюкозы."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 \n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"# Load the CSV and keep at most its first 1500 rows, then preview the result.\n",
"df = pd.read_csv(\"data/diabetes.csv\").head(1500)\n",
"print(df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Очистка данных"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Glucose BloodPressure SkinThickness Insulin BMI Age Outcome\n",
"0 148 72 35 0 33.6 50 1\n",
"1 85 66 29 0 26.6 31 0\n",
"2 183 64 0 0 23.3 32 1\n",
"3 89 66 23 94 28.1 21 0\n",
"4 137 40 35 168 43.1 33 1\n"
]
}
],
"source": [
"# Drop the Pregnancies and DiabetesPedigreeFunction columns (ignored if absent),\n",
"# then drop every row that contains a NaN.\n",
"# NOTE(review): the preview shows 0 in SkinThickness/Insulin; if those zeros stand for\n",
"# missing measurements, dropna() will not remove them -- confirm against the data source.\n",
"df_cleaned = (\n",
"    df\n",
"    .drop(columns=['Pregnancies', 'DiabetesPedigreeFunction'], errors='ignore')\n",
"    .dropna()\n",
")\n",
"print(df_cleaned.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Визуализация парных взаимодействий"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3xb9b0//pekoy1LtuVtJ3HiLMgw2QmQwd75lVFomZdCyyhlj3LhFrgUCoWywiijFMooo3BZ5duyobQkJGGFTJLYGd5TsqSjcaTz+8ORYtmyLclax349H48+aM45OudzzudY47w/789bJcuyDCIiIiIiIiIiIiIiohyiznYDiIiIiIiIiIiIiIiI+mMAg4iIiIiIiIiIiIiIcg4DGERERERERERERERElHMYwCAiIiIiIiIiIiIiopzDAAYREREREREREREREeUcBjCIiIiIiIiIiIiIiCjnMIBBREREREREREREREQ5hwEMIiIiIiIiIiIiIiLKOQxgEBERERERERERERFRzmEAg4goA8455xxMmzYt6n/z58/Hueeeiy+//DKy3eGHH45f//rXGWnT3r17MW3aNLz++utR/+77v+nTp2POnDk45ZRT8Le//S0j7VKi+vp6TJs2DYsWLYLf7892c4iIiIiI4rZq1SpMmzYt48c955xzcM4550T+PW3aNKxatSptx2tubsa9996LE088EXPmzMGcOXNw8skn44knnoAoilHbprstREQUPyHbDSAiGisOPPBA3HLLLQCAYDCIrq4u/PWvf8UFF1yA119/HVOmTMlyC3tdcsklWLFiBQBAlmW43W68+uqruOmmmyBJEn7yk59kt4E56LXXXkNNTQ127dqFf/zjH1i5cmW2m0REREREpCgvv/wyysrK0rLvNWvW4PLLL4fNZsOZZ56JadOmIRQKYc2aNXjsscfw3nvv4YUXXoBer0/L8YmIKHkMYBARZYjFYsFBBx0Utezggw/GkiVL8Prrr+OGG27ITsP6GT9+fMx2btmyBc888wwDGP0Eg0G88cYbOOOMM/D111/jpZdeYgCDiIiIiChB/X+DpEpnZyeuuuoqVFdX489//jNMJlNk3SGHHIIjjjgCP/3pT/Hss8/iF7/4RVraQEREyeMUUkREWWQ0GqHX66FSqWKu7+npwe9+9zsceeSRmDVrFk488cQBUzkFg0G88MILOOmkkzB79mysWLEC9957L3w+X9R27733HlauXInZs2fj5JNPxpYtW+Jup1qtxgEHHIDGxkYA+6eb+vOf/4xjjz0WtbW1eO211wAA27Ztw0UXXYS5c+di7ty5+OUvf4k9e/ZE7e/ZZ5/Fsccei1mzZmHp0qW49dZb4XK5Iuv//e9/4/TTT8ecOXOwYMECXHLJJdixY0dkfayptl5//XVMmzYNe/fuBdCbCn/UUUfh4YcfxsKFC3HooYfC4XAAAF599VWccMIJmDlzJlasWIFVq1YhGAzGfT36+vzzz9Ha2ooVK1Zg5cqVWL9+PbZv3z5gux07duDnP/855s6di4MPPhj3338/brzxxqi0+VAohCeeeAJHHXUUZs6ciWOOOQbPPfdcUu0iIiIiIkrU66+/jgMPPBDffvstzjjjDMyaNQuHHXYY/vSnP0Vt984770R+WyxevBjXXnstWlpaIutjTcE03FRVfV+zZs0aTJs2DV988QV+9rOfoba2FocccgjuueeehL+3v/jii+jo6MBvf/vbqOBFWG1tLc4777yY64CBvzPC+v8m8fv9eOCBB3DEEUdg9uzZOPHEE/F///d/Ua959913ccopp2DOnDk45JBD8Jvf/CbyGwUAvF4vbr31VixbtgwzZ87EscceO+Dad3d34ze/+Q0OPvhgzJo1C6effjq++OKLhK4JEZGSMAODiChDZFmGJEmR/9/d3Y1nn30Wfr8fp5566oDtvV4vzjzzTHR0dODyyy9HZWUlPvjgA9x0001ob2
/HxRdfDAD4zW9+gzfffBM///nPMX/+fGzatAmPPPIINm/ejKeeegoqlQofffQRLr/8cpx00km47rrrsHnzZlx33XUJtb+urg7jx4+PWrZq1SrcdNNNsFgsqK2tRV1dHX7yk59g0qRJuPvuuyFJEh577DH89Kc/xZtvvgm73Y533nkH99xzD2644QZMmzYNO3fuxN133w1RFHH33Xdjz549uPTSS3Hqqafi6quvhtPpxH333Ydf/OIXeP/996FWxx97b2xsxKeffor7778f3d3dsNlsePzxx3H//ffj7LPPxo033ojNmzdj1apVaGpqwp133pnQNQF6p4+aMmUKZs6ciZqaGtx222146aWXcPPNN0e26ezsxNlnnw273Y7f/e53CAaDePDBB9HY2Bg10uzWW2/F66+/josuughz5szB2rVrceedd8LpdOKXv/xlwm0jIiIiIkpUKBTClVdeif/6r//ClVdeib/97W/4/e9/j6lTp2Lp0qVYv349rr/+elx66aVYsGABmpubcc899+Caa67B888/n9K2XHvttTjzzDPx85//HJ988gmeeuopjBs3LqGs8A8//BDTpk0bcsreVGTDX3vttfj0009xySWXoLa2Fp9++il+/etfQ6vV4sQTT8Sjjz6Khx56CGeeeSauuuoq7NmzBw8++CC++eYbvPLKKzAYDLjzzjvx+eef44YbbkBRURE+++wz/P73v0d+fj5OPfVU+Hw+nHfeeWhvb8dVV12FkpISvPbaa7jwwgvx1FNPYcmSJSM+DyKiXMMABhFRhqxduxYzZswYsPzqq69GTU3NgOWvv/46tm3bhpdeeglz5swBACxduhSSJOHRRx/FT37yE7S3t+Nvf/sbrrnmmki68yGHHIKSkhJcf/31+Oyzz7B8+XI88sgjmD17Nu65557IfgDgD3/4w4DjhkKhSKAlFAqhpaUFzz33HLZs2YJbb701atvjjjsuKvhyzTXXwGg04plnnoHFYgEALFmyBEceeSSeeuop3HDDDfjyyy9RVVWFs846C2q1GgsXLoTJZIqMPPruu+/g9Xpx0UUXobS0FABQVlaGDz/8EB6PJ7LfeEiShBtuuAHz588H0JvR8uijj+KMM86IBBgOPfRQ5Ofn4+abb8b555+fUC2Srq4ufPTRR7j66qsB9GbUHH/88XjzzTcj1wIAnnvuObjdbrzxxhuRc6qtrcUxxxwT2VddXR1eeeUVXH311ZG+PPTQQ6FSqfD444/jzDPPREFBQdxtIyIiIiJKhizLuPTSS/HjH/8YADBv3jy8//77+OSTTyIBDIPBgF/84hfQ6XQAgPz8fGzYsAGyLA+aXZ6MH//4x5GBPEuWLMEHH3yATz75JKEAxu7du3HIIYcMWB7+zdOXICT3mGzbtm345z//if/+7//GeeedF2lvQ0MD1qxZg6VLl+Kxxx7D6aefjt/85jeR102dOhVnnXUWXnvtNZx11ln48ssvccghh+CEE04AACxatAgmkwl2ux0A8Oabb2LLli145ZVXUFtbCwBYtmwZzjnnHNx7772RrHgiotGEAQwiogyZMWMGbrvtNgC9PwqcTic+++wz3H///fB4PLjqqquitv/yyy9RWVkZCV6ErVy5En/729/w7bffoqGhAQAiX3DDTjjhBNx4441Ys2YNFi1ahI0bN+KKK66I2ua4446LGcC46aabcNNNN0Uty8vLwyWXXIIzzjgjavkBBxwQ9e/Vq1dj4cKFMBgMkR8EFosF8+fPx3/+8x8AwOLFi/Hyyy/jlFNOwZFHHonly5fjpJNOivzQqa2thV6vx2mnnYZjjz0Wy5Ytw6JFizB79uxBruzQ+rbx66+/htfrxeGHHx71g+Xwww8H0Dt1VSIBjLfeegvBYBArVqyA0+kEABx11FF49dVX8e6770aCO6tXr8acOXMiwQsAA/p29erVkGU5Ztsee+wxrF+/HkceeWSCZ09ERERElLi+31N1Oh0KCwvh8XgAAA
sWLMD999+PE088EccccwyWL1+OQw89FMuXL09rO4DegU3hdsQrFAoNWCZJUszBZVu3bk2sgfusX78eAHD00UdHLQ9
"text/plain": [
"<Figure size 1600x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"# One entry per panel of the 2x2 grid, in row-major order: (x column, y column, title).\n",
"scatter_panels = [\n",
"    ('BloodPressure', 'Age', 'BloodPressure_Age'),  # blood pressure vs age\n",
"    ('Insulin', 'Glucose', 'Insulin_Glucose'),      # insulin vs glucose\n",
"    ('BMI', 'Age', 'BMI_Age'),                      # body mass index vs age\n",
"    ('Glucose', 'BMI', 'Glucose_BMI'),              # glucose vs body mass index\n",
"]\n",
"\n",
"plt.figure(figsize=(16, 12))\n",
"for panel_no, (x_col, y_col, panel_title) in enumerate(scatter_panels, start=1):\n",
"    plt.subplot(2, 2, panel_no)\n",
"    sns.scatterplot(x=df_cleaned[x_col], y=df_cleaned[y_col], alpha=0.6)\n",
"    plt.title(panel_title)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Стандартизация данных для кластеризации"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Standardize every column of df_cleaned: fit the scaler, then apply it to the same frame.\n",
"# NOTE(review): df_cleaned still contains the Outcome column, so the label is part of the\n",
"# matrix that is later clustered -- confirm this is intended.\n",
"scaler = StandardScaler().fit(df_cleaned)\n",
"data_scaled = scaler.transform(df_cleaned)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Агломеративная (иерархическая) кластеризация"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1AAAAJxCAYAAABMnFMWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADQhklEQVR4nOzdeZxT1fnH8W8myewrszEMMOybyCKiUhUUELViLdRaq1K19eeGS12qVuu+1oKKola6qCDuULV1R6zaiii44I6I7DvDLCwzyST39wfNJckkmZtMZpIwn/fr5cshy70n99wk98lzznNshmEYAgAAAAC0KC3RDQAAAACAVEEABQAAAAAWEUABAAAAgEUEUAAAAABgEQEUAAAAAFhEAAUAAAAAFhFAAQAAAIBFBFAAAAAAYBEBFAAAAABYRAAFwJJrrrlG/fv3D/nfNddck+jmAfBTW1urESNGaNmyZaqtrdUFF1ygv//974luFgDsFxyJbgCA1FFaWqqZM2cG3HbRRRclqDUAwikoKNDZZ5+tU045RYZhqH///vrjH/+Y6GYBwH6BAAqAJR6PR9nZ2Ro2bFjA7enp6YlpEICILrroIp166qmqq6tTVVWV7HZ7opsEAPsFhvABsKSpqUmZmZmWHrtkyRKdccYZGjp0qA455BBdffXVqq6uNu+fP3+++vfvr3Xr1gU8b+zYsQHDAd1ud9hhg8Hb+uyzzzRp0iQNGTJEJ554ol577bWAbdfX1+vOO+/U+PHjdeCBB2rixIl6/vnnm+0/eD/r1q3TlClTdM011+jPf/6zfvSjH2nEiBG68MILtX79+oDnL1iwQKeddpqGDx+uwYMH67jjjtPcuXPN+xcvXmxud+nSpQHPfeKJJ9S/f3+NHTu2WXv+8Ic/BDy2trZWgwcPVv/+/bV48WLL+w/nueee0+TJkzVs2DANGTJEJ510kl599dVmxzjUsM1w/TNlypSAfbzyyiuaPHmyhg8frsMPP1w33HCDamtrzfsfeOAB9e/fX8OHD5fL5Qp47iWXXNJsqGhjY6PuvvtujRkzRoMHD9aJJ56oV155JeB5Y8eO1b333qs77rhDI0eO1KGHHqqrrrpKNTU1ll9/pKGr8+fPN/vUvx+2b9+ugw8+OGRf9u/fXwMGDNDIkSN18cUXa8eOHeZj+vfvrwceeCCgbb7jEsuxlKSSkhL16tVL77//fovDbYP39fLLL2vkyJGaPn26pMDzN/g//3Z/8803uuiii3TYYYfpgAMO0JFHHqnbbrtNDQ0N5mNcLpfuu+8+jRs3TkOGDNHEiRP1j3/8w9Ixl6QNGzbo8ssv1yGHHKKhQ4fqzDPP1FdffWVuf926derfv79efvllnX/++Ro6dKiOOuooPfjgg/J6vQH9EnxMLr/88oA+NQxDM2bM0JFHHqkRI0bo/PPP18aNG83HezwezZo1SxMnTtSQIUM0bNgwnXrqqfrggw8i9qPUvM+D/20Yhk499dSAz8trrrkm4NySpKeffjrk+QOgbZCBAmDJnj17VFBQ0OLjPvroI5199tk67LDDdN9996m2tlYzZszQr371Kz3//POWgzBp70WyJD388MPq1KmTpL0Xu8GBjySdd955OuOMM3TZZZfp+eef129/+1s98sgjGjNmjBoaGnTaaadp+/btuuSSS1RZWakFCxbouuuu07Zt23T++eeb2xkzZowuvPBC899lZWWSpLfeektFRUX6wx/+IK/Xq+nTp2vKlCl6+eWXlZWVpX//+9+aOnWqfvWrX+niiy9WQ0ODnnzySd1yyy0aPHiwhg4dam4zJydHCxcu1IgRI8zbXnnlFaWlNf9NKycnR//+979lGIZsNpsk6Y033pDH4wl4XDT79zd37lzddtttuvjiizVixAjV1tbqL3/5i6688koNHz5cnTt3Nh87c+ZMlZaWSpLZH5J08skn6+c//7n575tvvjlgHw899JDuv/9+nXbaabrsss
u0du1azZgxQ59++qmeffbZgHPCZrNp0aJFGjNmjCRp165deueddwKOjWEYmjp1qj7++GNdcskl6t27t958801ddtllcrlc+ulPf2o+9sknn1RVVZXuvPNOVVdXa/r06Vq9erWefvpp2Wy2Fl//hRdeqFNPPVXS3ozOoEGDzPOje/fu+u6775od0+nTp6u+vl75+fkBt/vOLbfbre+//1533323br/9dk2bNi1k34QSzbH0cbvduuOOOyzvQ5IaGhp0yy236JxzztGJJ54YcN8NN9ygAw44wPz3L37xC/PvLVu26PTTT9ewYcN01113KT09Xe+++64effRRlZWV6dxzz5UkXXnllXrnnXd0wQUXaOjQoXrnnXd0zTXXyOl0tnjMq6urdeqppyorK0vXX3+9srKy9Pjjj+v000/X888/r969e5vtuemmmzRmzBg98MADWrp0qWbOnKndu3frd7/7XcjXvWTJEr388ssBtz322GN65JFHdNVVV6lnz5666667dOmll+rZZ5+VJE2bNk1PPfWUrrjiCvXv31+bN2/Wgw8+qEsvvVT//ve/lZWVFdWx9/fiiy/qk08+ifiY2tpa3XfffTHvA0D0CKAAWFJTU2MGE5FMnz5dPXv21COPPGIOGRo6dKhOOOEEzZs3T6effrrlfe7evVuSNHz4cBUVFUmS3nvvvZCPnTJliqZOnSpJOvLIIzVp0iQ9+OCDGjNmjObPn6/ly5fr6aef1vDhw83HNDU16aGHHtKpp56qwsJCSXsDg+BhitLeAHL+/Pnq1q2bJKlXr16aNGmSXnjhBf3yl7/UihUrNGnSJF133XXmc4YPH65DDz1UixcvDghgRo8erbfeesu8iNu0aZM++eQTHXzwwc2yWqNGjdI777yjzz77zGzXq6++qpEjRwZkPaLZv7+1a9fqN7/5TUDQWFlZqcmTJ2vp0qU64YQTzNsHDhyorl27NttG586dA45Zbm6u+Xdtba0efvhhnXLKKbrhhhvM2/v166fTTz+92TnhOza+AGrhwoUqLS0NyBq8//77eu+993Tvvffqxz/+saS9/blnzx5NmzZNEydOlMOx9+stLS1Njz76qPLy8iTt7d+pU6fqvffe0+jRoy29/u7du0vaO1w13Pnh8/nnn+vFF1/UwIEDVVdXF3Cf/3NHjhyp999/X19++WXYbQWL9lj6zJkzR7t371ZJSYnlff3rX/+S0+nUOeec02zoX58+fcIeg+XLl2vgwIGaMWOGeR786Ec/0n//+18tXrxY5557rpYvX67XX39d1157rc4880xJe8/z9evXa/HixZo4cWLEY37vvfeqpqZGTz31lCorKyXtPW9+/OMfa8aMGbr//vvNxx5wwAFmgDp69Gjt3r1bjz/+uC644IKA81SSvF6vbrvtNh1wwAEB/bJ7925deOGFOuussyTtzW7dcsstqqurU35+vrZs2aLLLrssIOuakZGhiy++WN9++23E8yWSXbt2adq0ac3aE+z+++9Xly5dArKZANoWQ/gAWLJlyxaVl5dHfMyePXv02WefacyYMTIMQ01NTWpqalK3bt3Uu3dv/fe//w14vNfrNR/T1NTUbHubNm1SWlpaswudUCZNmmT+bbPZdMwxx2jZsmVqaGjQhx9+qMrKSjN48vnJT36ixsZGffbZZy1u/6CDDjKDJ0kaNGiQunXrpo8++kiSdM455+iuu+7Srl279MUXX+iVV17RI488IknNhqSNHTtWq1at0sqVKyVJr732moYOHWpeDPrLy8vTIYccorfeekuSVF1drcWLFwcENtHu398111yjK6+8UnV1dfr000/14osvmsP+Ij3Pqk8//VQul0sTJ04MuP3ggw9WZWWlPvzww4Dbx40bp4ULF8owDEl7M3O+IMln0aJFstlsGjNmTMD5M3bsWG3dujUgKzR27FgzePL92+FwmP0Wz9dvGIZuu+02nXzyyRowYEDI+5uamuRyubRs2TItXbpUgw
cPDnhM8HvCP3CM9lhK0rZt2/Tggw/q6quvVkZGhqXXsXnzZv3lL3/RaaedFvW8qSOOOEJPPPGEMjIytGLFCr311lt
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 4 15 6 15 2 15 2 14 1 17 8 6 7 1 3 13 4 6 10 2 12 8 6 2\n",
" 3 2 6 15 7 8 7 3 15 15 10 10 8 2 2 4 12 8 7 3 8 4 15 15\n",
" 2 16 15 15 15 3 12 15 3 10 8 10 16 6 15 11 6 10 2 8 15 11 2 11\n",
" 6 12 10 9 8 10 13 15 15 16 10 15 6 10 10 10 4 15 15 11 10 5 11 12\n",
" 10 15 15 4 6 15 8 15 8 11 8 12 10 2 3 1 10 8 3 5 6 8 15 15\n",
" 4 10 10 7 6 2 10 10 2 5 3 6 3 10 15 11 10 15 8 12 7 10 15 6\n",
" 12 17 10 10 7 15 12 8 3 1 6 4 15 15 15 4 12 10 12 15 6 2 11 15\n",
" 15 15 6 2 14 10 15 3 8 4 8 6 15 11 9 15 8 4 1 4 4 3 15 10\n",
" 6 13 15 4 15 2 2 3 15 8 15 15 12 15 3 6 10 4 15 10 12 4 2 4\n",
" 2 10 2 6 1 5 14 7 15 10 8 4 1 10 6 4 15 8 15 6 3 4 4 15\n",
" 15 10 6 2 12 5 8 1 12 15 8 8 15 10 2 2 10 15 11 3 12 13 10 7\n",
" 6 10 13 10 15 13 2 15 8 10 8 10 2 15 7 11 6 12 11 5 5 7 1 4\n",
" 15 10 10 2 4 2 7 12 3 11 2 7 13 2 10 6 8 10 3 11 2 2 15 10\n",
" 3 15 2 11 15 6 10 5 11 2 2 3 10 11 2 8 4 10 7 10 13 8 15 12\n",
" 14 6 3 6 11 10 9 8 8 12 15 14 15 9 8 8 10 15 8 5 2 13 10 3\n",
" 3 7 7 5 12 10 6 15 15 3 1 17 10 10 10 4 15 10 6 10 10 15 11 15\n",
" 11 15 2 2 3 10 10 6 11 15 6 11 10 2 15 4 6 7 4 10 6 10 6 15\n",
" 6 1 10 11 12 11 2 1 15 4 15 2 12 15 10 15 4 3 16 3 12 2 14 10\n",
" 15 8 15 13 12 8 15 8 4 10 11 6 2 4 15 10 2 11 15 6 10 14 10 4\n",
" 7 10 4 7 7 15 10 10 8 15 15 10 13 12 10 10 10 7 15 7 4 15 10 7\n",
" 3 10 15 10 13 4 1 12 15 7 10 10 10 2 16 7 15 15 3 12 11 10 9 10\n",
" 10 8 4 11 15 7 2 11 7 15 15 3 3 8 8 7 15 11 16 6 15 15 15 11\n",
" 11 15 15 8 10 14 10 13 8 7 12 4 4 2 2 10 10 3 3 11 12 12 15 10\n",
" 7 15 10 10 10 7 10 8 6 3 10 15 8 15 10 10 12 2 8 8 15 11 12 10\n",
" 15 6 8 4 4 15 7 8 1 15 6 15 3 14 2 12 6 15 12 3 8 14 6 15\n",
" 15 14 10 3 13 10 3 15 12 11 15 3 3 10 3 15 8 15 2 13 12 15 8 10\n",
" 15 10 15 8 8 15 6 10 15 11 15 6 8 15 4 15 15 8 5 14 11 1 3 4\n",
" 3 15 15 11 10 15 10 1 15 12 8 2 7 4 3 4 2 12 5 2 10 12 12 15\n",
" 10 12 7 6 5 8 6 11 15 4 10 6 17 11 15 15 11 4 15 6 10 4 15 1\n",
" 3 14 11 8 11 2 4 14 11 10 16 11 5 2 11 10 2 11 15 3 3 7 10 2\n",
" 10 10 3 12 8 10 11 11 8 15 2 6 4 15 7 10 11 10 11 6 4 15 15 5\n",
" 12 10 4 10 3 5 6 10 15 1 4 4 12 6 8 5 15 4 15 12 10 11 6 10]\n"
]
}
],
"source": [
"# Build the Ward-linkage hierarchy over the standardized samples.\n",
"linkage_matrix = linkage(data_scaled, method='ward')\n",
"\n",
"# Draw the dendrogram on an explicitly created axes.\n",
"fig, ax = plt.subplots(figsize=(10, 7))\n",
"dendrogram(linkage_matrix, ax=ax)\n",
"ax.set_title('Дендрограмма агломеративной кластеризации')\n",
"ax.set_xlabel('Индекс образца')\n",
"ax.set_ylabel('Расстояние')\n",
"plt.show()\n",
"\n",
"# Cut the tree at a linkage distance of 10 to obtain one flat cluster label per sample.\n",
"result = fcluster(linkage_matrix, criterion='distance', t=10)\n",
"print(result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Визуализация распределения кластеров"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3wc1dXw8d/sbC/SqnfJtlzkJiHcAWPjQOgklAChpZAHEiCElgQCb0geHkgIJCShJJQQCIQOoQdCJwZsY+MCuNuyrd779pl5/xBaey3JVdKu5PP9JB+subM7Z2dmy9wz9x7FMAwDIYQQQgghhBBCCCGEEEKIBGKKdwBCCCGEEEIIIYQQQgghhBC7kwSGEEIIIYQQQgghhBBCCCESjiQwhBBCCCGEEEIIIYQQQgiRcCSBIYQQQgghhBBCCCGEEEKIhCMJDCGEEEIIIYQQQgghhBBCJBxJYAghhBBCCCGEEEIIIYQQIuFIAkMIIYQQQgghhBBCCCGEEAlHEhhCCCGEEEIIIYQQQgghhEg4ksAQQgghhBBCCCGEEEIIIUTCkQSGEEIMgwsvvJBJkybF/H/mzJlcdNFFLFu2LLreokWLuP7664clpqqqKiZNmsQLL7wQ8/eu/y8pKaG8vJwzzjiD5557bljiGom2bdvGpEmTmDNnDqFQKN7hCCGEEEIIsc/uvvtuJk2aNOzbvfDCC7nwwgujf0+aNIm77757yLZXV1fHnXfeySmnnEJ5eTnl5eWcfvrpPPDAA/j9/ph1hzoWIYQQ+84c7wCEEOJQMWXKFG6++WYANE2jtbWVJ598kosvvpgXXniBCRMmxDnCHj/60Y9YuHAhAIZh0N3dzbPPPsuNN95IJBLh3HPPjW+ACej555+nuLiY7du388Ybb3DaaafFOyQhhBBCCCFGlKeffprs7Owhee6lS5dy5ZVXkpyczHnnncekSZPQdZ2lS5fyl7/8hf/85z/885//xGazDcn2hRBCHDhJYAghxDBxu90cdthhMcuOOOII5s2bxwsvvMDPf/7z+AS2m8LCwn7jXL9+PY888ogkMHajaRovvvgi55xzDitXruSpp56SBIYQQgghhBD7afdrkMHS0tLC1VdfzZgxY/j73/+O0+mMth155JF87Wtf49vf/jaPPvool1xyyZDEIIQQ4sDJFFJCCBFHDocDm82Goij9tnd2dvKb3/yGY489lunTp3PKKaf0mcpJ0zT++c9/cuqpp1JaWsrChQu58847CQaDMev95z//4bTTTqO0tJTTTz+d9evX73OcJpOJyZMnU1NTA+ycburvf/87J5xwAmVlZTz//PMAbNy4kUsvvZTDDz+cww8/nMsvv5zKysqY53v00Uc54YQTmD59OvPnz+dXv/oVXV1d0faPPvqIs88+m/LycmbNmsWPfvQjtmzZEm3vb6qtF154gUmTJlFVVQX0DIU/7rjjuOeee5g9ezZHHXUU7e3tADz77LOcfPLJTJs2jYULF3L33Xejado+749dLV68mIaGBhYuXMhpp53GihUr2Lx5c5/1tmzZwv/8z/9w+OGHc8QRR3DXXXdxww03xAyb13WdBx54gOOOO45p06Zx/PHH89hjjx1QXEIIIYQQQuyvF154gSlTprB69WrOOeccpk+fzjHHHMPf/va3mPVeffXV6LXF3Llzue6666ivr4+29zcF096mqtr1MUuXLmXSpEl88sknfP/736esrIwjjzySO+64Y79/tz/xxBM0Nzfzf//3fzHJi15lZWV85zvf6bcN+l5n9Nr9miQUCvHHP/6Rr33ta5SWlnLKKafwr3/9K+Yxr7/+OmeccQbl5eUceeSR/PKXv4xeowAEAgF+9atfcfTRRzNt2jROOOGEPvu+ra2NX/7ylxxxxBFMnz6ds88+m08++WS/9okQQowkMgJDCCGGiWEYRCKR6L/b2tp49NFHCYVCnHnmmX3WDwQCnHfeeTQ3N3PllVeSl5
fH22+/zY033khTUxM//OEPAfjlL3/JSy+9xP/8z/8wc+ZM1q5dy7333su6det46KGHUBSFd999lyuvvJJTTz2Vn/70p6xbt46f/vSn+xV/RUUFhYWFMcvuvvtubrzxRtxuN2VlZVRUVHDuuecybtw4br/9diKRCH/5y1/49re/zUsvvURaWhqvvvoqd9xxBz//+c+ZNGkSW7du5fbbb8fv93P77bdTWVnJZZddxplnnsk111xDR0cHf/jDH7jkkkt46623MJn2PfdeU1PDBx98wF133UVbWxvJycncf//93HXXXVxwwQXccMMNrFu3jrvvvpva2lpuu+22/don0DN91IQJE5g2bRrFxcX8+te/5qmnnuKmm26KrtPS0sIFF1xAWloav/nNb9A0jT/96U/U1NTE3Gn2q1/9ihdeeIFLL72U8vJyPv30U2677TY6Ojq4/PLL9zs2IYQQQggh9peu61x11VV897vf5aqrruK5557jd7/7HRMnTmT+/PmsWLGCn/3sZ1x22WXMmjWLuro67rjjDq699loef/zxQY3luuuu47zzzuN//ud/eP/993nooYcoKCjYr1Hh77zzDpMmTdrjlL2DMRr+uuuu44MPPuBHP/oRZWVlfPDBB1x//fVYLBZOOeUU7rvvPv785z9z3nnncfXVV1NZWcmf/vQnVq1axTPPPIPdbue2225j8eLF/PznPyc9PZ0PP/yQ3/3ud3i9Xs4880yCwSDf+c53aGpq4uqrryYzM5Pnn3+eH/zgBzz00EPMmzfvoF+HEEIkGklgCCHEMPn000+ZOnVqn+XXXHMNxcXFfZa/8MILbNy4kaeeeory8nIA5s+fTyQS4b777uPcc8+lqamJ5557jmuvvTY63PnII48kMzOTn/3sZ3z44YcsWLCAe++9l9LSUu64447o8wD8/ve/77NdXdejiRZd16mvr+exxx5j/fr1/OpXv4pZ98QTT4xJvlx77bU4HA4eeeQR3G43APPmzePYY4/loYce4uc//znLli0jPz+f888/H5PJxOzZs3E6ndE7j9asWUMgEODSSy8lKysLgOzsbN555x18Pl/0efdFJBLh5z//OTNnzgR6RrTcd999nHPOOdEEw1FHHYXX6+Wmm27ie9/73n7VImltbeXdd9/lmmuuAXpG1Jx00km89NJL0X0B8Nhjj9Hd3c2LL74YfU1lZWUcf/zx0eeqqKjgmWee4Zprrokey6OOOgpFUbj//vs577zzSElJ2efYhBBCCCGEOBCGYXDZZZfxrW99C4AZM2bw1ltv8f7770cTGHa7nUsuuQSr1QqA1+vl888/xzCMAUeXH4hvfetb0Rt55s2bx9tvv83777+/XwmMHTt2cOSRR/ZZ3nvNsyuz+cC6yTZu3Mibb77JL37xC77zne9E462urmbp0qXMnz+fv/zlL5x99tn88pe/jD5u4sSJnH/++Tz//POcf/75LFu2jCOPPJKTTz4ZgDlz5uB0OklLSwPgpZdeYv369TzzzDOUlZUBcPTRR3PhhRdy5513RkfFCyHEaCIJDCGEGCZTp07l17/+NdBzUdDR0cGHH37IXXfdhc/n4+qrr45Zf9myZeTl5UWTF71OO+00nnvuOVavXk11dTVA9Adur5NPPpkbbriBpUuXMmfOHL788kt+8pOfxKxz4okn9pvAuPHGG7nxxhtjlnk8Hn70ox9xzjnnxCyfPHlyzN9Llixh9uzZ2O326AWB2+1m5syZfPzxxwDMnTuXp59+mjPOOINjjz2WBQsWcOqpp0YvdMrKyrDZbJx11lmccMIJHH300cyZM4fS0tIB9uye7RrjypUrCQQCLFq0KOaCZdGiRUDP1FX7k8B4+eWX0TSNhQsX0tHRAcBxxx3Hs88+y+uvvx5N7ixZsoTy8vJo8gLoc2yXLFmCYRj9xvaXv/yFFStWcOyxx+7nqxdCCCGEEGL/7fo71Wq1kpqais/nA2DWrFncddddnHLKKRx//PEsWL
CAo446igULFgxpHNBzY1NvHPtK1/U+yyKRSL83l23YsGH/AvzKihUrAPj6178es7x3SqwPPviAUCjEKaecEtM+c+Z
"text/plain": [
"<Figure size 1600x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"# One entry per panel of the 2x2 grid, in row-major order: (x column, y column, title).\n",
"# NOTE(review): points are coloured by the Outcome column, not by cluster labels --\n",
"# confirm that this is what the section is meant to show.\n",
"outcome_panels = [\n",
"    ('BloodPressure', 'Age', 'BloodPressure_Age'),  # blood pressure vs age\n",
"    ('Insulin', 'Glucose', 'Insulin_Glucose'),      # insulin vs glucose\n",
"    ('BMI', 'Age', 'BMI_Age'),                      # body mass index vs age\n",
"    ('Glucose', 'BMI', 'Glucose_BMI'),              # glucose vs body mass index\n",
"]\n",
"\n",
"fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
"for ax, (x_col, y_col, panel_title) in zip(axes.flat, outcome_panels):\n",
"    sns.scatterplot(\n",
"        x=df_cleaned[x_col], y=df_cleaned[y_col],\n",
"        hue=df_cleaned['Outcome'], palette='Set1', alpha=0.6, ax=ax,\n",
"    )\n",
"    ax.set_title(panel_title)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## KMeans (неиерархическая кластеризация) для сравнения"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans\n",
"from sklearn.preprocessing import StandardScaler\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"# Columns fed to K-Means. Centroid columns follow this order, so the plots below look up\n",
"# centroid coordinates by feature name instead of hard-coded positions.\n",
"kmeans_features = ['Glucose', 'BMI', 'BloodPressure', 'Age']\n",
"\n",
"# Scale with a dedicated scaler and array so that `scaler` / `data_scaled` from the\n",
"# hierarchical-clustering section are not overwritten with a different feature set\n",
"# (re-running that section after this cell would otherwise change its result).\n",
"kmeans_scaler = StandardScaler()\n",
"kmeans_data_scaled = kmeans_scaler.fit_transform(df_cleaned[kmeans_features])\n",
"\n",
"# Fit K-Means. n_init is set explicitly because its default differs between\n",
"# scikit-learn releases, which would make the clustering depend on the installed version.\n",
"random_state = 17\n",
"kmeans = KMeans(n_clusters=4, n_init=10, random_state=random_state)\n",
"labels = kmeans.fit_predict(kmeans_data_scaled)\n",
"\n",
"# Map the centroids back from standardized units to the original feature units.\n",
"centers = kmeans_scaler.inverse_transform(kmeans.cluster_centers_)\n",
"print(\"Центры кластеров:\\n\", centers)\n",
"\n",
"# One entry per panel of the 2x2 grid, in row-major order: (x feature, y feature).\n",
"kmeans_panels = [\n",
"    ('Glucose', 'BMI'),\n",
"    ('Glucose', 'Age'),\n",
"    ('BloodPressure', 'BMI'),\n",
"    ('BloodPressure', 'Age'),\n",
"]\n",
"\n",
"plt.figure(figsize=(16, 12))\n",
"for panel_no, (x_col, y_col) in enumerate(kmeans_panels, start=1):\n",
"    plt.subplot(2, 2, panel_no)\n",
"    # Samples coloured by their K-Means label.\n",
"    sns.scatterplot(x=df_cleaned[x_col], y=df_cleaned[y_col], hue=labels, palette='Set1', alpha=0.6)\n",
"    # Centroids, projected onto the two features shown in this panel.\n",
"    plt.scatter(\n",
"        centers[:, kmeans_features.index(x_col)],\n",
"        centers[:, kmeans_features.index(y_col)],\n",
"        s=300, c='red', label='Centroids',\n",
"    )\n",
"    plt.title(f'KMeans Clustering: {x_col} vs {y_col}')\n",
"    plt.legend()\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}