379 lines
926 KiB
Plaintext
379 lines
926 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Бизнес-цель\n",
|
|||
|
"Анализ ключевых факторов, влияющих на диабет. Предсказание вероятности развития диабета на основе медданных. Актуальность для планирвоания лечения.\n",
|
|||
|
"1. Уровень давления(BloodPressure) и возраст(Age) - с возрастом артериальное давление может увеличиться, что является фактором риска для диабета.\n",
|
|||
|
"2. Уровень инсулина(Insulin) и уровень глюкозы(Glucose) - уровень инсулина напрямую влияет на уровень сахара в крови.\n",
|
|||
|
"3. Индекс массы тела(BMI) и возраст(Age) - с повышением возраста зачастую увеличивается индекс массы тела.\n",
|
|||
|
"4. Уровень глюкозы(Glucose) и индекс массы тела(BMI) - как индекс массы тела влияет на уровень глюкозы."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
|
|||
|
"0 6 148 72 35 0 33.6 \n",
|
|||
|
"1 1 85 66 29 0 26.6 \n",
|
|||
|
"2 8 183 64 0 0 23.3 \n",
|
|||
|
"3 1 89 66 23 94 28.1 \n",
|
|||
|
"4 0 137 40 35 168 43.1 \n",
|
|||
|
"\n",
|
|||
|
" DiabetesPedigreeFunction Age Outcome \n",
|
|||
|
"0 0.627 50 1 \n",
|
|||
|
"1 0.351 31 0 \n",
|
|||
|
"2 0.672 32 1 \n",
|
|||
|
"3 0.167 21 0 \n",
|
|||
|
"4 2.288 33 1 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"from sklearn.decomposition import PCA\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.metrics import silhouette_score\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"data/diabetes.csv\")\n",
|
|||
|
"df = df.head(1500)\n",
|
|||
|
"print(df.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Очистка данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 13,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" Glucose BloodPressure SkinThickness Insulin BMI Age Outcome\n",
|
|||
|
"0 148 72 35 0 33.6 50 1\n",
|
|||
|
"1 85 66 29 0 26.6 31 0\n",
|
|||
|
"2 183 64 0 0 23.3 32 1\n",
|
|||
|
"3 89 66 23 94 28.1 21 0\n",
|
|||
|
"4 137 40 35 168 43.1 33 1\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df_cleaned = df.drop(columns=['Pregnancies', 'DiabetesPedigreeFunction'], errors='ignore').dropna()\n",
|
|||
|
"print(df_cleaned.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Визуализация парных взаимодействий"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 14,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3xb9b0//pekoy1LtuVtJ3HiLMgw2QmQwd75lVFomZdCyyhlj3LhFrgUCoWywiijFMooo3BZ5duyobQkJGGFTJLYGd5TsqSjcaTz+8ORYtmyLclax349H48+aM45OudzzudY47w/789bJcuyDCIiIiIiIiIiIiIiohyiznYDiIiIiIiIiIiIiIiI+mMAg4iIiIiIiIiIiIiIcg4DGERERERERERERERElHMYwCAiIiIiIiIiIiIiopzDAAYREREREREREREREeUcBjCIiIiIiIiIiIiIiCjnMIBBREREREREREREREQ5hwEMIiIiIiIiIiIiIiLKOQxgEBERERERERERERFRzmEAg4goA8455xxMmzYt6n/z58/Hueeeiy+//DKy3eGHH45f//rXGWnT3r17MW3aNLz++utR/+77v+nTp2POnDk45ZRT8Le//S0j7VKi+vp6TJs2DYsWLYLf7892c4iIiIiI4rZq1SpMmzYt48c955xzcM4550T+PW3aNKxatSptx2tubsa9996LE088EXPmzMGcOXNw8skn44knnoAoilHbprstREQUPyHbDSAiGisOPPBA3HLLLQCAYDCIrq4u/PWvf8UFF1yA119/HVOmTMlyC3tdcsklWLFiBQBAlmW43W68+uqruOmmmyBJEn7yk59kt4E56LXXXkNNTQ127dqFf/zjH1i5cmW2m0REREREpCgvv/wyysrK0rLvNWvW4PLLL4fNZsOZZ56JadOmIRQKYc2aNXjsscfw3nvv4YUXXoBer0/L8YmIKHkMYBARZYjFYsFBBx0Utezggw/GkiVL8Prrr+OGG27ITsP6GT9+fMx2btmyBc888wwDGP0Eg0G88cYbOOOMM/D111/jpZdeYgCDiIiIiChB/X+DpEpnZyeuuuoqVFdX489//jNMJlNk3SGHHIIjjjgCP/3pT/Hss8/iF7/4RVraQEREyeMUUkREWWQ0GqHX66FSqWKu7+npwe9+9zsceeSRmDVrFk488cQBUzkFg0G88MILOOmkkzB79mysWLEC9957L3w+X9R27733HlauXInZs2fj5JNPxpYtW+Jup1qtxgEHHIDGxkYA+6eb+vOf/4xjjz0WtbW1eO211wAA27Ztw0UXXYS5c+di7ty5+OUvf4k9e/ZE7e/ZZ5/Fsccei1mzZmHp0qW49dZb4XK5Iuv//e9/4/TTT8ecOXOwYMECXHLJJdixY0dkfayptl5//XVMmzYNe/fuBdCbCn/UUUfh4YcfxsKFC3HooYfC4XAAAF599VWccMIJmDlzJlasWIFVq1YhGAzGfT36+vzzz9Ha2ooVK1Zg5cqVWL9+PbZv3z5gux07duDnP/855s6di4MPPhj3338/brzxxqi0+VAohCeeeAJHHXUUZs6ciWOOOQbPPfdcUu0iIiIiIkrU66+/jgMPPBDffvstzjjjDMyaNQuHHXYY/vSnP0Vt984770R+WyxevBjXXnstWlpaIutjTcE03FRVfV+zZs0aTJs2DV988QV+9rOfoba2FocccgjuueeehL+3v/jii+jo6MBvf/vbqOBFWG1tLc4777yY64CBvzPC+v8m8fv9eOCBB3DEEUdg9uzZOPHEE/F///d/Ua959913ccopp2DOnDk45JBD8Jvf/CbyGwUAvF4vbr31VixbtgwzZ87EscceO+Dad3d34ze/+Q0OPvhgzJo1C6effjq++OKLhK4JEZGSMAODiChDZFmGJEmR/9/d3Y1nn30Wfr8fp5566oDtvV4vzjzzTHR0dODyyy9HZWUlPvjgA9x0001ob2/HxRdfDAD4zW9+gzfffBM///nPMX/+fGzatAmPPPIINm/ejKeeegoqlQofffQRLr/8cpx00km47rrrsHnzZlx33XUJtb+urg7jx4+PWrZq1SrcdNNNsFgsqK2tRV1dHX7yk59g0qRJuPvuuyFJEh577DH89Kc/xZtvvgm73Y533nkH99xzD2644QZMmzYNO3fuxN133w1RFHH33Xdjz549uPTSS3Hqqafi6quvhtPpxH333Ydf/OIXeP/996FWxx97b2xsxKeffor7778f3d3dsNlsePzxx3H//ffj7LPPxo033ojNmzdj1apVaGpqwp133pnQNQF6p4+aMmUKZs6ciZqaGtx222146aWXcPPNN0e26ezsxNlnnw273Y7f/e53CAaDePDBB9HY2Bg10uzWW2/F66+/josuughz5szB2rVrceedd8LpdOKXv/xlwm0jIiIiIkpUKBTClVdeif/6r//ClVdeib/97W/4/e9/j6lTp2Lp0qVYv349rr/+elx66aVYsGABmpubcc899+Caa67B888/n9K2XHvttTjzzDPx85//HJ988gmeeuopjBs3LqGs8A8//BDTpk0bcsreVGTDX3vttfj0009xySWXoLa2Fp9++il+/etfQ6vV4sQTT8Sjjz6Khx56CGeeeSauuuoq7NmzBw8++CC++eYbvPLKKzAYDLjzzjvx+eef44YbbkBRURE+++wz/P73v0d+fj5OPfVU+Hw+nHfeeWhvb8dVV12FkpISvPbaa7jwwgvx1FNPYcmSJSM+DyKiXMMABhFRhqxduxYzZswYsPzqq69GTU3NgOWvv/46tm3bhpdeeglz5swBACxduhSSJOHRRx/FT37yE7S3t+Nvf/sbrrnmmki68yGHHIKSkhJcf/31+Oyzz7B8+XI88sgjmD17Nu65557IfgDgD3/4w4DjhkKhSKAlFAqhpaUFzz33HLZs2YJbb701atvjjjsuKvhyzTXXwGg04plnnoHFYgEALFmyBEceeSSeeuop3HDDDfjyyy9RVVWFs846C2q1GgsXLoTJZIqMPPruu+/g9Xpx0UUXobS0FABQVlaGDz/8EB6PJ7LfeEiShBtuuAHz588H0JvR8uijj+KMM86IBBgOPfRQ5Ofn4+abb8b555+fUC2Srq4ufPTRR7j66qsB9GbUHH/88XjzzTcj1wIAnnvuObjdbrzxxhuRc6qtrcUxxxwT2VddXR1eeeUVXH311ZG+PPTQQ6FSqfD444/jzDPPREFBQdxtIyIiIiJKhizLuPTSS/HjH/8YADBv3jy8//77+OSTTyIBDIPBgF/84hfQ6XQAgPz8fGzYsAGyLA+aXZ6MH//4x5GBPEuWLMEHH3yATz75JKEAxu7du3HIIYcMWB7+zdOXICT3mGzbtm345z//if/+7//GeeedF2lvQ0MD1qxZg6VLl+Kxxx7D6aefjt/85jeR102dOhVnnXUWXnvtNZx11ln48ssvccghh+CEE04AACxatAgmkwl2ux0A8Oabb2LLli145ZVXUFtbCwBYtmwZzjnnHNx7772RrHgiotGEAQwiogyZMWMGbrvtNgC9PwqcTic+++wz3H///fB4PLjqqquitv/yyy9RWVkZCV6ErVy5En/729/w7bffoqGhAQAiX3DDTjjhBNx4441Ys2YNFi1ahI0bN+KKK66I2ua4446LGcC46aabcNNNN0Uty8vLwyWXXIIzzjgjavkBBxwQ9e/Vq1dj4cKFMBgMkR8EFosF8+fPx3/+8x8AwOLFi/Hyyy/jlFNOwZFHHonly5fjpJNOivzQqa2thV6vx2mnnYZjjz0Wy5Ytw6JFizB79uxBruzQ+rbx66+/htfrxeGHHx71g+Xwww8H0Dt1VSIBjLfeegvBYBArVqyA0+kEABx11FF49dVX8e6770aCO6tXr8acOXMiwQsAA/p29erVkGU5Ztsee+wxrF+/HkceeWSCZ09ERERElLi+31N1Oh0KCwvh8XgAAAsWLMD999+PE088EccccwyWL1+OQw89FMuXL09rO4DegU3hdsQrFAoNWCZJUszBZVu3bk2sgfusX78eAHD00UdHLQ9
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x1200 with 4 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"sns.set(style=\"whitegrid\")\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(16, 12))\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация взаимосвязи уровня давления и возраста\n",
|
|||
|
"plt.subplot(2, 2, 1)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['BloodPressure'], y=df_cleaned['Age'], alpha=0.6)\n",
|
|||
|
"plt.title('BloodPressure_Age')\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация взаимосвязи уровня инсулина и уровня глюкозы\n",
|
|||
|
"plt.subplot(2, 2, 2)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['Insulin'], y=df_cleaned['Glucose'], alpha=0.6)\n",
|
|||
|
"plt.title('Insulin_Glucose')\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация взаимосвязи индекса массы тела и возраста\n",
|
|||
|
"plt.subplot(2, 2, 3)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['BMI'], y=df_cleaned['Age'], alpha=0.6)\n",
|
|||
|
"plt.title('BMI_Age')\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация взаимосвязи уровня глюкозы и индекса массы тела\n",
|
|||
|
"plt.subplot(2, 2, 4)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['Glucose'], y=df_cleaned['BMI'], alpha=0.6)\n",
|
|||
|
"plt.title('Glucose_BMI')\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Стандартизация данных для кластеризации"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 15,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"data_scaled = scaler.fit_transform(df_cleaned)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Агломеративная (иерархическая) кластеризация"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1AAAAJxCAYAAABMnFMWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADQhklEQVR4nOzdeZxT1fnH8W8myewrszEMMOybyCKiUhUUELViLdRaq1K19eeGS12qVuu+1oKKola6qCDuULV1R6zaiii44I6I7DvDLCwzyST39wfNJckkmZtMZpIwn/fr5cshy70n99wk98lzznNshmEYAgAAAAC0KC3RDQAAAACAVEEABQAAAAAWEUABAAAAgEUEUAAAAABgEQEUAAAAAFhEAAUAAAAAFhFAAQAAAIBFBFAAAAAAYBEBFAAAAABYRAAFwJJrrrlG/fv3D/nfNddck+jmAfBTW1urESNGaNmyZaqtrdUFF1ygv//974luFgDsFxyJbgCA1FFaWqqZM2cG3HbRRRclqDUAwikoKNDZZ5+tU045RYZhqH///vrjH/+Y6GYBwH6BAAqAJR6PR9nZ2Ro2bFjA7enp6YlpEICILrroIp166qmqq6tTVVWV7HZ7opsEAPsFhvABsKSpqUmZmZmWHrtkyRKdccYZGjp0qA455BBdffXVqq6uNu+fP3+++vfvr3Xr1gU8b+zYsQHDAd1ud9hhg8Hb+uyzzzRp0iQNGTJEJ554ol577bWAbdfX1+vOO+/U+PHjdeCBB2rixIl6/vnnm+0/eD/r1q3TlClTdM011+jPf/6zfvSjH2nEiBG68MILtX79+oDnL1iwQKeddpqGDx+uwYMH67jjjtPcuXPN+xcvXmxud+nSpQHPfeKJJ9S/f3+NHTu2WXv+8Ic/BDy2trZWgwcPVv/+/bV48WLL+w/nueee0+TJkzVs2DANGTJEJ510kl599dVmxzjUsM1w/TNlypSAfbzyyiuaPHmyhg8frsMPP1w33HCDamtrzfsfeOAB9e/fX8OHD5fL5Qp47iWXXNJsqGhjY6PuvvtujRkzRoMHD9aJJ56oV155JeB5Y8eO1b333qs77rhDI0eO1KGHHqqrrrpKNTU1ll9/pKGr8+fPN/vUvx+2b9+ugw8+OGRf9u/fXwMGDNDIkSN18cUXa8eOHeZj+vfvrwceeCCgbb7jEsuxlKSSkhL16tVL77//fovDbYP39fLLL2vkyJGaPn26pMDzN/g//3Z/8803uuiii3TYYYfpgAMO0JFHHqnbbrtNDQ0N5mNcLpfuu+8+jRs3TkOGDNHEiRP1j3/8w9Ixl6QNGzbo8ssv1yGHHKKhQ4fqzDPP1FdffWVuf926derfv79efvllnX/++Ro6dKiOOuooPfjgg/J6vQH9EnxMLr/88oA+NQxDM2bM0JFHHqkRI0bo/PPP18aNG83HezwezZo1SxMnTtSQIUM0bNgwnXrqqfrggw8i9qPUvM+D/20Yhk499dSAz8trrrkm4NySpKeffjrk+QOgbZCBAmDJnj17VFBQ0OLjPvroI5199tk67LDDdN9996m2tlYzZszQr371Kz3//POWgzBp70WyJD388MPq1KmTpL0Xu8GBjySdd955OuOMM3TZZZfp+eef129/+1s98sgjGjNmjBoaGnTaaadp+/btuuSSS1RZWakFCxbouuuu07Zt23T++eeb2xkzZowuvPBC899lZWWSpLfeektFRUX6wx/+IK/Xq+nTp2vKlCl6+eWXlZWVpX//+9+aOnWqfvWrX+niiy9WQ0ODnnzySd1yyy0aPHiwhg4dam4zJydHCxcu1IgRI8zbXnnlFaWlNf9NKycnR//+979lGIZsNpsk6Y033pDH4wl4XDT79zd37lzddtttuvjiizVixAjV1tbqL3/5i6688koNHz5cnTt3Nh87c+ZMlZaWSpLZH5J08skn6+c//7n575tvvjlgHw899JDuv/9+nXbaabrsssu0du1azZgxQ59++qmeffbZgHPCZrNp0aJFGjNmjCRp165deueddwKOjWEYmjp1qj7++GNdcskl6t27t958801ddtllcrlc+ulPf2o+9sknn1RVVZXuvPNOVVdXa/r06Vq9erWefvpp2Wy2Fl//hRdeqFNPPVXS3ozOoEGDzPOje/fu+u6775od0+nTp6u+vl75+fkBt/vOLbfbre+//1533323br/9dk2bNi1k34QSzbH0cbvduuOOOyzvQ5IaGhp0yy236JxzztGJJ54YcN8NN9ygAw44wPz3L37xC/PvLVu26PTTT9ewYcN01113KT09Xe+++64effRRlZWV6dxzz5UkXXnllXrnnXd0wQUXaOjQoXrnnXd0zTXXyOl0tnjMq6urdeqppyorK0vXX3+9srKy9Pjjj+v000/X888/r969e5vtuemmmzRmzBg98MADWrp0qWbOnKndu3frd7/7XcjXvWTJEr388ssBtz322GN65JFHdNVVV6lnz5666667dOmll+rZZ5+VJE2bNk1PPfWUrrjiCvXv31+bN2/Wgw8+qEsvvVT//ve/lZWVFdWx9/fiiy/qk08+ifiY2tpa3XfffTHvA0D0CKAAWFJTU2MGE5FMnz5dPXv21COPPGIOGRo6dKhOOOEEzZs3T6effrrlfe7evVuSNHz4cBUVFUmS3nvvvZCPnTJliqZOnSpJOvLIIzVp0iQ9+OCDGjNmjObPn6/ly5fr6aef1vDhw83HNDU16aGHHtKpp56qwsJCSXsDg+BhitLeAHL+/Pnq1q2bJKlXr16aNGmSXnjhBf3yl7/UihUrNGnSJF133XXmc4YPH65DDz1UixcvDghgRo8erbfeesu8iNu0aZM++eQTHXzwwc2yWqNGjdI777yjzz77zGzXq6++qpEjRwZkPaLZv7+1a9fqN7/5TUDQWFlZqcmTJ2vp0qU64YQTzNsHDhyorl27NttG586dA45Zbm6u+Xdtba0efvhhnXLKKbrhhhvM2/v166fTTz+92TnhOza+AGrhwoUqLS0NyBq8//77eu+993Tvvffqxz/+saS9/blnzx5NmzZNEydOlMOx9+stLS1Njz76qPLy8iTt7d+pU6fqvffe0+jRoy29/u7du0vaO1w13Pnh8/nnn+vFF1/UwIEDVVdXF3Cf/3NHjhyp999/X19++WXYbQWL9lj6zJkzR7t371ZJSYnlff3rX/+S0+nUOeec02zoX58+fcIeg+XLl2vgwIGaMWOGeR786Ec/0n//+18tXrxY5557rpYvX67XX39d1157rc4880xJe8/z9evXa/HixZo4cWLEY37vvfeqpqZGTz31lCorKyXtPW9+/OMfa8aMGbr//vvNxx5wwAFmgDp69Gjt3r1bjz/+uC644IKA81SSvF6vbrvtNh1wwAEB/bJ7925deOGFOuussyTtzW7dcsstqqurU35+vrZs2aLLLrssIOuakZGhiy++WN9++23E8yWSXbt2adq0ac3aE+z+++9Xly5dArKZANoWQ/gAWLJlyxaVl5dHfMyePXv02WefacyYMTIMQ01NTWpqalK3bt3Uu3dv/fe//w14vNfrNR/T1NTUbHubNm1SWlpaswudUCZNmmT+bbPZdMwxx2jZsmVqaGjQhx9+qMrKSjN48vnJT36ixsZGffbZZy1u/6CDDjKDJ0kaNGiQunXrpo8++kiSdM455+iuu+7Srl279MUXX+iVV17RI488IknNhqSNHTtWq1at0sqVKyVJr732moYOHWpeDPrLy8vTIYccorfeekuSVF1drcWLFwcENtHu398111yjK6+8UnV1dfr000/14osvmsP+Ij3Pqk8//VQul0sTJ04MuP3ggw9WZWWlPvzww4Dbx40bp4ULF8owDEl7M3O+IMln0aJFstlsGjNmTMD5M3bsWG3dujUgKzR27FgzePL92+FwmP0Wz9dvGIZuu+02nXzyyRowYEDI+5uamuRyubRs2TItXbpUgwcPDnhM8HvCP3CM9lhK0rZt2/Tggw/q6quvVkZGhqXXsXnzZv3lL3/RaaedFvW8qSOOOEJPPPGEMjIytGLFCr311lt
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x700 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[ 4 15 6 15 2 15 2 14 1 17 8 6 7 1 3 13 4 6 10 2 12 8 6 2\n",
|
|||
|
" 3 2 6 15 7 8 7 3 15 15 10 10 8 2 2 4 12 8 7 3 8 4 15 15\n",
|
|||
|
" 2 16 15 15 15 3 12 15 3 10 8 10 16 6 15 11 6 10 2 8 15 11 2 11\n",
|
|||
|
" 6 12 10 9 8 10 13 15 15 16 10 15 6 10 10 10 4 15 15 11 10 5 11 12\n",
|
|||
|
" 10 15 15 4 6 15 8 15 8 11 8 12 10 2 3 1 10 8 3 5 6 8 15 15\n",
|
|||
|
" 4 10 10 7 6 2 10 10 2 5 3 6 3 10 15 11 10 15 8 12 7 10 15 6\n",
|
|||
|
" 12 17 10 10 7 15 12 8 3 1 6 4 15 15 15 4 12 10 12 15 6 2 11 15\n",
|
|||
|
" 15 15 6 2 14 10 15 3 8 4 8 6 15 11 9 15 8 4 1 4 4 3 15 10\n",
|
|||
|
" 6 13 15 4 15 2 2 3 15 8 15 15 12 15 3 6 10 4 15 10 12 4 2 4\n",
|
|||
|
" 2 10 2 6 1 5 14 7 15 10 8 4 1 10 6 4 15 8 15 6 3 4 4 15\n",
|
|||
|
" 15 10 6 2 12 5 8 1 12 15 8 8 15 10 2 2 10 15 11 3 12 13 10 7\n",
|
|||
|
" 6 10 13 10 15 13 2 15 8 10 8 10 2 15 7 11 6 12 11 5 5 7 1 4\n",
|
|||
|
" 15 10 10 2 4 2 7 12 3 11 2 7 13 2 10 6 8 10 3 11 2 2 15 10\n",
|
|||
|
" 3 15 2 11 15 6 10 5 11 2 2 3 10 11 2 8 4 10 7 10 13 8 15 12\n",
|
|||
|
" 14 6 3 6 11 10 9 8 8 12 15 14 15 9 8 8 10 15 8 5 2 13 10 3\n",
|
|||
|
" 3 7 7 5 12 10 6 15 15 3 1 17 10 10 10 4 15 10 6 10 10 15 11 15\n",
|
|||
|
" 11 15 2 2 3 10 10 6 11 15 6 11 10 2 15 4 6 7 4 10 6 10 6 15\n",
|
|||
|
" 6 1 10 11 12 11 2 1 15 4 15 2 12 15 10 15 4 3 16 3 12 2 14 10\n",
|
|||
|
" 15 8 15 13 12 8 15 8 4 10 11 6 2 4 15 10 2 11 15 6 10 14 10 4\n",
|
|||
|
" 7 10 4 7 7 15 10 10 8 15 15 10 13 12 10 10 10 7 15 7 4 15 10 7\n",
|
|||
|
" 3 10 15 10 13 4 1 12 15 7 10 10 10 2 16 7 15 15 3 12 11 10 9 10\n",
|
|||
|
" 10 8 4 11 15 7 2 11 7 15 15 3 3 8 8 7 15 11 16 6 15 15 15 11\n",
|
|||
|
" 11 15 15 8 10 14 10 13 8 7 12 4 4 2 2 10 10 3 3 11 12 12 15 10\n",
|
|||
|
" 7 15 10 10 10 7 10 8 6 3 10 15 8 15 10 10 12 2 8 8 15 11 12 10\n",
|
|||
|
" 15 6 8 4 4 15 7 8 1 15 6 15 3 14 2 12 6 15 12 3 8 14 6 15\n",
|
|||
|
" 15 14 10 3 13 10 3 15 12 11 15 3 3 10 3 15 8 15 2 13 12 15 8 10\n",
|
|||
|
" 15 10 15 8 8 15 6 10 15 11 15 6 8 15 4 15 15 8 5 14 11 1 3 4\n",
|
|||
|
" 3 15 15 11 10 15 10 1 15 12 8 2 7 4 3 4 2 12 5 2 10 12 12 15\n",
|
|||
|
" 10 12 7 6 5 8 6 11 15 4 10 6 17 11 15 15 11 4 15 6 10 4 15 1\n",
|
|||
|
" 3 14 11 8 11 2 4 14 11 10 16 11 5 2 11 10 2 11 15 3 3 7 10 2\n",
|
|||
|
" 10 10 3 12 8 10 11 11 8 15 2 6 4 15 7 10 11 10 11 6 4 15 15 5\n",
|
|||
|
" 12 10 4 10 3 5 6 10 15 1 4 4 12 6 8 5 15 4 15 12 10 11 6 10]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"linkage_matrix = linkage(data_scaled, method='ward')\n",
|
|||
|
"plt.figure(figsize=(10, 7))\n",
|
|||
|
"dendrogram(linkage_matrix)\n",
|
|||
|
"plt.title('Дендрограмма агломеративной кластеризации')\n",
|
|||
|
"plt.xlabel('Индекс образца')\n",
|
|||
|
"plt.ylabel('Расстояние')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Получение результатов кластеризации с заданным порогом\n",
|
|||
|
"result = fcluster(linkage_matrix, t=10, criterion='distance')\n",
|
|||
|
"print(result)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Визуализация распределения кластеров"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3wc1dXw8d/sbC/SqnfJtlzkJiHcAWPjQOgklAChpZAHEiCElgQCb0geHkgIJCShJJQQCIQOoQdCJwZsY+MCuNuyrd779pl5/xBaey3JVdKu5PP9JB+subM7Z2dmy9wz9x7FMAwDIYQQQgghhBBCCCGEEEKIBGKKdwBCCCGEEEIIIYQQQgghhBC7kwSGEEIIIYQQQgghhBBCCCESjiQwhBBCCCGEEEIIIYQQQgiRcCSBIYQQQgghhBBCCCGEEEKIhCMJDCGEEEIIIYQQQgghhBBCJBxJYAghhBBCCCGEEEIIIYQQIuFIAkMIIYQQQgghhBBCCCGEEAlHEhhCCCGEEEIIIYQQQgghhEg4ksAQQgghhBBCCCGEEEIIIUTCkQSGEEIMgwsvvJBJkybF/H/mzJlcdNFFLFu2LLreokWLuP7664clpqqqKiZNmsQLL7wQ8/eu/y8pKaG8vJwzzjiD5557bljiGom2bdvGpEmTmDNnDqFQKN7hCCGEEEIIsc/uvvtuJk2aNOzbvfDCC7nwwgujf0+aNIm77757yLZXV1fHnXfeySmnnEJ5eTnl5eWcfvrpPPDAA/j9/ph1hzoWIYQQ+84c7wCEEOJQMWXKFG6++WYANE2jtbWVJ598kosvvpgXXniBCRMmxDnCHj/60Y9YuHAhAIZh0N3dzbPPPsuNN95IJBLh3HPPjW+ACej555+nuLiY7du388Ybb3DaaafFOyQhhBBCCCFGlKeffprs7Owhee6lS5dy5ZVXkpyczHnnncekSZPQdZ2lS5fyl7/8hf/85z/885//xGazDcn2hRBCHDhJYAghxDBxu90cdthhMcuOOOII5s2bxwsvvMDPf/7z+AS2m8LCwn7jXL9+PY888ogkMHajaRovvvgi55xzDitXruSpp56SBIYQQgghhBD7afdrkMHS0tLC1VdfzZgxY/j73/+O0+mMth155JF87Wtf49vf/jaPPvool1xyyZDEIIQQ4sDJFFJCCBFHDocDm82Goij9tnd2dvKb3/yGY489lunTp3PKKaf0mcpJ0zT++c9/cuqpp1JaWsrChQu58847CQaDMev95z//4bTTTqO0tJTTTz+d9evX73OcJpOJyZMnU1NTA+ycburvf/87J5xwAmVlZTz//PMAbNy4kUsvvZTDDz+cww8/nMsvv5zKysqY53v00Uc54YQTmD59OvPnz+dXv/oVXV1d0faPPvqIs88+m/LycmbNmsWPfvQjtmzZEm3vb6qtF154gUmTJlFVVQX0DIU/7rjjuOeee5g9ezZHHXUU7e3tADz77LOcfPLJTJs2jYULF3L33Xejado+749dLV68mIaGBhYuXMhpp53GihUr2Lx5c5/1tmzZwv/8z/9w+OGHc8QRR3DXXXdxww03xAyb13WdBx54gOOOO45p06Zx/PHH89hjjx1QXEIIIYQQQuyvF154gSlTprB69WrOOeccpk+fzjHHHMPf/va3mPVeffXV6LXF3Llzue6666ivr4+29zcF096mqtr1MUuXLmXSpEl88sknfP/736esrIwjjzySO+64Y79/tz/xxBM0Nzfzf//3fzHJi15lZWV85zvf6bcN+l5n9Nr9miQUCvHHP/6Rr33ta5SWlnLKKafwr3/9K+Yxr7/+OmeccQbl5eUceeSR/PKXv4xeowAEAgF+9atfcfTRRzNt2jROOOGEPvu+ra2NX/7ylxxxxBFMnz6ds88+m08++WS/9okQQowkMgJDCCGGiWEYRCKR6L/b2tp49NFHCYVCnHnmmX3WDwQCnHfeeTQ3N3PllVeSl5fH22+/zY033khTUxM//OEPAfjlL3/JSy+9xP/8z/8wc+ZM1q5dy7333su6det46KGHUBSFd999lyuvvJJTTz2Vn/70p6xbt46f/vSn+xV/RUUFhYWFMcvuvvtubrzxRtxuN2VlZVRUVHDuuecybtw4br/9diKRCH/5y1/49re/zUsvvURaWhqvvvoqd9xxBz//+c+ZNGkSW7du5fbbb8fv93P77bdTWVnJZZddxplnnsk111xDR0cHf/jDH7jkkkt46623MJn2PfdeU1PDBx98wF133UVbWxvJycncf//93HXXXVxwwQXccMMNrFu3jrvvvpva2lpuu+22/don0DN91IQJE5g2bRrFxcX8+te/5qmnnuKmm26KrtPS0sIFF1xAWloav/nNb9A0jT/96U/U1NTE3Gn2q1/9ihdeeIFLL72U8vJyPv30U2677TY6Ojq4/PLL9zs2IYQQQggh9peu61x11VV897vf5aqrruK5557jd7/7HRMnTmT+/PmsWLGCn/3sZ1x22WXMmjWLuro67rjjDq699loef/zxQY3luuuu47zzzuN//ud/eP/993nooYcoKCjYr1Hh77zzDpMmTdrjlL2DMRr+uuuu44MPPuBHP/oRZWVlfPDBB1x//fVYLBZOOeUU7rvvPv785z9z3nnncfXVV1NZWcmf/vQnVq1axTPPPIPdbue2225j8eLF/PznPyc9PZ0PP/yQ3/3ud3i9Xs4880yCwSDf+c53aGpq4uqrryYzM5Pnn3+eH/zgBzz00EPMmzfvoF+HEEIkGklgCCHEMPn000+ZOnVqn+XXXHMNxcXFfZa/8MILbNy4kaeeeory8nIA5s+fTyQS4b777uPcc8+lqamJ5557jmuvvTY63PnII48kMzOTn/3sZ3z44YcsWLCAe++9l9LSUu64447o8wD8/ve/77NdXdejiRZd16mvr+exxx5j/fr1/OpXv4pZ98QTT4xJvlx77bU4HA4eeeQR3G43APPmzePYY4/loYce4uc//znLli0jPz+f888/H5PJxOzZs3E6ndE7j9asWUMgEODSSy8lKysLgOzsbN555x18Pl/0efdFJBLh5z//OTNnzgR6RrTcd999nHPOOdEEw1FHHYXX6+Wmm27ie9/73n7VImltbeXdd9/lmmuuAXpG1Jx00km89NJL0X0B8Nhjj9Hd3c2LL74YfU1lZWUcf/zx0eeqqKjgmWee4Zprrokey6OOOgpFUbj//vs577zzSElJ2efYhBBCCCGEOBCGYXDZZZfxrW99C4AZM2bw1ltv8f7770cTGHa7nUsuuQSr1QqA1+vl888/xzCMAUeXH4hvfetb0Rt55s2bx9tvv83777+/XwmMHTt2cOSRR/ZZ3nvNsyuz+cC6yTZu3Mibb77JL37xC77zne9E462urmbp0qXMnz+fv/zlL5x99tn88pe/jD5u4sSJnH/++Tz//POcf/75LFu2jCOPPJKTTz4ZgDlz5uB0OklLSwPgpZdeYv369TzzzDOUlZUBcPTRR3PhhRdy5513RkfFCyHEaCIJDCGEGCZTp07l17/+NdBzUdDR0cGHH37IXXfdhc/n4+qrr45Zf9myZeTl5UWTF71OO+00nnvuOVavXk11dTVA9Adur5NPPpkbbriBpUuXMmfOHL788kt+8pOfxKxz4okn9pvAuPHGG7nxxhtjlnk8Hn70ox9xzjnnxCyfPHlyzN9Llixh9uzZ2O326AWB2+1m5syZfPzxxwDMnTuXp59+mjPOOINjjz2WBQsWcOqpp0YvdMrKyrDZbJx11lmccMIJHH300cyZM4fS0tIB9uye7RrjypUrCQQCLFq0KOaCZdGiRUDP1FX7k8B4+eWX0TSNhQsX0tHRAcBxxx3Hs88+y+uvvx5N7ixZsoTy8vJo8gLoc2yXLFmCYRj9xvaXv/yFFStWcOyxx+7nqxdCCCGEEGL/7fo71Wq1kpqais/nA2DWrFncddddnHLKKRx//PEsWLCAo446igULFgxpHNBzY1NvHPtK1/U+yyKRSL83l23YsGH/AvzKihUrAPj6178es7x3SqwPPviAUCjEKaecEtM+c+Z
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x1200 with 4 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"sns.set(style=\"whitegrid\")\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(16, 12))\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация взаимосвязи уровня давления и возраста\n",
|
|||
|
"plt.subplot(2, 2, 1)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['BloodPressure'], y=df_cleaned['Age'], hue=df_cleaned['Outcome'], palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.title('BloodPressure_Age')\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация взаимосвязи уровня инсулина и уровня глюкозы\n",
|
|||
|
"plt.subplot(2, 2, 2)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['Insulin'], y=df_cleaned['Glucose'], hue=df_cleaned['Outcome'], palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.title('Insulin_Glucose')\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация взаимосвязи индекса массы тела и возраста\n",
|
|||
|
"plt.subplot(2, 2, 3)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['BMI'], y=df_cleaned['Age'], hue=df_cleaned['Outcome'], palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.title('BMI_Age')\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация взаимосвязи уровня глюкозы и индекса массы тела\n",
|
|||
|
"plt.subplot(2, 2, 4)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['Glucose'], y=df_cleaned['BMI'], hue=df_cleaned['Outcome'], palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.title('Glucose_BMI')\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## KMeans (неиерархическая кластеризация) для сравнения"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование данных\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"data_scaled = scaler.fit_transform(df_cleaned[['Glucose', 'BMI', 'BloodPressure', 'Age']])\n",
|
|||
|
"\n",
|
|||
|
"# Обучение K-Means\n",
|
|||
|
"random_state = 17\n",
|
|||
|
"kmeans = KMeans(n_clusters=4, random_state=random_state)\n",
|
|||
|
"labels = kmeans.fit_predict(data_scaled)\n",
|
|||
|
"centers = kmeans.cluster_centers_\n",
|
|||
|
"\n",
|
|||
|
"# Обратная стандартизация центров кластеров\n",
|
|||
|
"centers = scaler.inverse_transform(centers)\n",
|
|||
|
"print(\"Центры кластеров:\\n\", centers)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация кластеризации\n",
|
|||
|
"plt.figure(figsize=(16, 12))\n",
|
|||
|
"\n",
|
|||
|
"# Взаимосвязь Glucose и BMI\n",
|
|||
|
"plt.subplot(2, 2, 1)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['Glucose'], y=df_cleaned['BMI'], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.scatter(centers[:, 0], centers[:, 1], s=300, c='red', label='Centroids')\n",
|
|||
|
"plt.title('KMeans Clustering: Glucose vs BMI')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"\n",
|
|||
|
"# Взаимосвязь Glucose и Age\n",
|
|||
|
"plt.subplot(2, 2, 2)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['Glucose'], y=df_cleaned['Age'], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.scatter(centers[:, 0], centers[:, 3], s=300, c='red', label='Centroids')\n",
|
|||
|
"plt.title('KMeans Clustering: Glucose vs Age')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"\n",
|
|||
|
"# Взаимосвязь BloodPressure и BMI\n",
|
|||
|
"plt.subplot(2, 2, 3)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['BloodPressure'], y=df_cleaned['BMI'], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.scatter(centers[:, 2], centers[:, 1], s=300, c='red', label='Centroids')\n",
|
|||
|
"plt.title('KMeans Clustering: BloodPressure vs BMI')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"\n",
|
|||
|
"# Взаимосвязь BloodPressure и Age\n",
|
|||
|
"plt.subplot(2, 2, 4)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['BloodPressure'], y=df_cleaned['Age'], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.scatter(centers[:, 2], centers[:, 3], s=300, c='red', label='Centroids')\n",
|
|||
|
"plt.title('KMeans Clustering: BloodPressure vs Age')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|