AIM-PIbd-31-LOBASHOV-I-D/lab_5/lab_5.ipynb

790 lines
2.0 MiB
Plaintext
Raw Normal View History

2024-11-22 23:37:27 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Вариант 19:* Данные о миллионерах"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Перечислим атрибуты датасета\n",
"Rank - Ранг миллионера в списке.\n",
"\n",
"Name - Имя миллионера.\n",
"\n",
"Networth - Чистый капитал миллионера (в миллиардах долларов США).\n",
"\n",
"Age - Возраст миллионера.\n",
"\n",
"Country - Страна проживания миллионера.\n",
"\n",
"Source - Источник богатства миллионера.\n",
"\n",
"Industry - Отрасль, в которой миллионер заработал своё состояние.\n",
"\n",
"Бизнес-цель: Анализ и визуализация данных о миллионерах для образовательных и информационных программ.\n",
"\n",
"Группировать миллионеров по \"интересным\" характеристикам для визуализации и информирования общества (например, самые молодые миллионеры, миллионеры с наибольшим капиталом, распределение миллионеров по странам и отраслям и т.д.)."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Rank Name Networth Age Country \\\n",
"0 1 Elon Musk 219.0 50 United States \n",
"1 2 Jeff Bezos 171.0 58 United States \n",
"2 3 Bernard Arnault & family 158.0 73 France \n",
"3 4 Bill Gates 129.0 66 United States \n",
"4 5 Warren Buffett 118.0 91 United States \n",
"\n",
" Source Industry \n",
"0 Tesla, SpaceX Automotive \n",
"1 Amazon Technology \n",
"2 LVMH Fashion & Retail \n",
"3 Microsoft Technology \n",
"4 Berkshire Hathaway Finance & Investments \n"
]
}
],
"source": [
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"# Dataset location relative to the notebook folder (lab_5/ next to static/),\n",
"# replacing an absolute path inside one user's Windows profile.\n",
"# NOTE(review): assumes the kernel's working directory is the notebook's own folder.\n",
"DATA_PATH = Path(\"..\") / \"static\" / \"csv\" / \"Forbes Billionaires.csv\"\n",
"\n",
"# Only the first N_ROWS rows of the file take part in the analysis.\n",
"N_ROWS = 1500\n",
"\n",
"df = pd.read_csv(DATA_PATH)\n",
"df = df.head(N_ROWS)\n",
"print(df.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Очистка данных\n",
"\n",
"Удалим несущественные для кластеризации столбцы (Name и Source) и строки с пропущенными значениями."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Rank Networth Age Country Industry\n",
"0 1 219.0 50 United States Automotive \n",
"1 2 171.0 58 United States Technology \n",
"2 3 158.0 73 France Fashion & Retail \n",
"3 4 129.0 66 United States Technology \n",
"4 5 118.0 91 United States Finance & Investments \n"
]
}
],
"source": [
"# Drop the Name and Source columns (errors='ignore' tolerates their absence),\n",
"# then remove every row that still contains a missing value.\n",
"df_cleaned = (\n",
"    df\n",
"    .drop(columns=['Name', 'Source'], errors='ignore')\n",
"    .dropna()\n",
")\n",
"print(df_cleaned.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Визуализация парных взаимосвязей"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxU1cH/8e+smSWZLJONQNjCrogiAtqCBPtYt9KqT/vYn7W1FatWa+tarUtr1a5qtaI+tS7Vtta1T7VurbXUBREBERFR2SGQkJCVzJLMJPP7g2Y0ZoWZm7kz+bxfL17KvTNzzxzm3plzvvecY4nFYjEBAAAAAAAAAACYiDXVBQAAAAAAAAAAAPg0AgwAAAAAAAAAAGA6BBgAAAAAAAAAAMB0CDAAAAAAAAAAAIDpEGAAAAAAAAAAAADTIcAAAAAAAAAAAACmQ4ABAAAAAAAAAABMhwADAAAAAAAAAACYDgEGAGBYisViw/LYAAAAAIyX6t/8qT4+ACQLAQYAZJjLLrtMkydP1gMPPJDqoiRsxYoVmjx5sr7zne/0uv8vf/mLJk+erKqqqgN63bvvvlv3339/Mop4wDZu3KivfvWr3bZNnjxZd955Z0rKAwAAgOHrrLPO0uTJk7v9mTJlimbOnKnTTjtNTz/9tCHHvfPOOzV58mRDXvtAVFVVafLkyTrttNMUjUZ77O9qj6xYseKAXveJJ57QL37xi2QV84DU1NTo29/+tnbt2hXftnDhQl111VUpKQ8AJIoAAwAyyL59+/TPf/5TkyZN0mOPPZYxd928/PLLeuaZZ5L2enfccYdCoVDSXu9AvPjii1qzZk1Kjg0AAAB82rRp0/TYY4/F//zpT3/SjTfeKJvNpiuvvFKvvPJKqotouPXr1+t3v/td0l7vnnvuUVNTU9Je70C88cYbw+LfDMDwQYABABnk2WeflSRdc8012rZtm958880Ulyg5fD6fbr75Zu3duzfVRQEAAAAySnZ2tg4//PD4nyOPPFInn3yyHnjgATkcDv3lL39JdREN5/P5dNddd2njxo2pLgoA4FMIMAAggzz11FM6+uijNXfuXI0ZM0aPPvpoj8fcf//9Ou6443TYYYfpjDPO0L/+9a8ew6I/+ugjnXfeeZo5c6ZmzpypCy+8UDt37uzzuH/72980efJkffTRR922//Of/9TkyZP1/vvvS5IeeughnXDCCZo+fbrmzZunH//4x2ptbR3wfV1yySUKBoP68Y9/POBjd+/erUsvvVSzZ8/WjBkz9I1vfCN+fEnxoepLlizR5MmT9fDDD2vKlClqbGyMP+auu+7S5MmTtXz58m7vZcqUKdqzZ48kad26dTrnnHM0Z84czZw5U+eff363Bk/XcPNHH31UlZWVmjlzps444wwtWbIkXo5PThvV2tqqa665RrNnz9YRRxyhiy++mMAGAAAAKZOVlSWn0ymLxRLf1tDQoBtuuEGVlZU69NBDNXv2bF144YXdpnQ966yzdM011+jee+/VggULNH36dJ1xxhl69913+zzW7t27tWDBAp122mlqaWnpsf/tt9/W5MmTtXTp0m7bN2zYoMmTJ+ull16StP+GrkWLFumwww7T3Llzdfnll8d/v/fnvPPOU3Z2tq666ip1dHT0+9impiZdf/31OuaYYzR9+nR95Stf6dZuWLhwoXbt2qX/+7//i7c3PtkmkqS//vWvmjx5sp544oke76VrtPa2bdt08cUX6zOf+YwOP/xwnXXWWVq9enX88V3TXz344IM64YQTNGPGDD311FO6+uqrJUnHHXdct2mjIpGIfvnLX8Zf71vf+pa2b98+YN0AQKoRYABAhti4caPWrVunL33pS5KkL33pS3r55Ze7dYIvWbJEt9xyi0488UTdfffdmjFjhr7//e93e52tW7fqjDPOUH19vX7xi1/o5ptv1s6dO/XVr35V9fX1vR77c5/7nDwej5577rlu25999llNnD
hR06ZN07PPPqtf/epXOvPMM3X//ffrwgsv1NNPP60bb7xxwPdWUVGh7373u3rppZfio0x609DQoDPOOEPr16/Xddddp1tvvVWdnZ0688wztXnzZknSY489Jkn67//+bz322GNasGCBYrFYt9EqXf+/cuXK+LZXX31V06ZNU0lJid588834OhY//elPddNNN6m6ulpnnHFG/DhdlixZoh/84Ae6/vrrdeutt+q///u/4+X48pe/HH/cww8/rEgkojvuuEOXXXaZ/vWvf+knP/nJgHUDAAAAJCIWiykajcb/tLW1acuWLbr66qsVCAT0xS9+Mf648847T8uWLdPll1+u+++/XxdddJGWL1+uH/3oR91e8+9//7tefvllXXvttbrtttu0d+9effe73+01HKirq9PZZ5+tvLw8Pfjgg/L5fD0eM3PmTI0ePbrX9kZeXp6OPfZYrV69WldeeaWOP/54/e53v9PVV1+tN998U5dddtmAdVBQUKDrr79e7733nu67774+H9fW1qZvfOMbevnll3XJJZdoyZIlKi0t1eLFi+MhxpIlS1RUVKRjjz1Wjz32mE499VQ5nU698cYb8dfpam+sWrUqvu3VV19VQUGBZsyYoU2bNum0005TVVWVrr32Wt1yyy2yWCz6xje+obfeeqtbme68806de+65+uUvf6ljjjlGF1xwQbwcn1xL8Pnnn9fGjRv185//XD/60Y/03nvv6ZJLLhmwbgAg1eypLgAAIDmeeuop5eXlaeHChZKkU089VXfeeaeefPJJnX/++QoGg/rd736nM888U5dffrkk6bOf/axCoVC8U1/a/0PX7Xbr97//vbKzsyVJRx99tD73uc/pvvvu0w9+8IMex3a73fr85z+v559/Pv4jOBAIaOnSpbrwwgslSW+99ZZGjRqlM888U1arVbNnz5bH41Fzc/Og3t8555yjl156STfeeKPmzp2rwsLCHo956KGH1NTUpD//+c8aOXKkJGn+/Pk66aSTdMcdd+g3v/mNDj/8cElSaWlp/P/HjRun5cuX68QTT1QoFNKaNWt0yCGHdAswXnvtNZ122mmSpFtvvVVjxozRvffeK5vNFq/L//qv/9JvfvMb3XHHHfHn/b//9/90wgknxP9eWloqSfFjd5k+fbp++ctfxut77dq1zF0LAAAAw61cuVKHHHJIt20Wi0WTJk3SHXfcocrKSklSbW2t3G63fvCDH2jWrFmSpDlz5mjHjh3d2hOSFI1Gdf/998fbE4FAQD/4wQ+0YcMGHXroofHHNTY26pvf/KZcLpcefPBB5ebm9lnORYsW6YEHHlA4HJbL5VIsFtPzzz+vE044QU6nU6tXr5bL5dK3v/1tOZ1OSVJeXp7WrVunWCzWbSRJb0466SS98MILWrJkiRYuXKiJEyf2eMzTTz+tDz74QI8//rhmzJghaX9746yzztItt9yip556StOmTZPT6VRBQUH8N//s2bO1fPlyLV68WJK0fPnyXtsbxx57rKxWq5YsWSKn06mHH344XocLFizQKaecol/+8pd68skn48878cQTdfrpp8f/Pnr0aEnS1KlTNWrUqPj2kpIS3X333XI4HJKk7du365577lFra2v8GABgRozAAIAMEIlE9Mwzz+hzn/ucwuGwWlpa5PV6deSRR+rxxx9XZ2en3nnnHYXD4W6d6ZJ0yimndPv7m2++qdmzZ8vlcsXvwsrOztasWbO63TX0aV/84he1Y8eO+NDwl19+We3t7Vq0aJEkae7cudq6datOO+00LVmyROvWrdMXvvAFnXXWWYN6jzabTT/72c8UDAZ1ww039PqY5cuXa+rUqSopKYmX3Wq1av78+f2WfcGCBfH9q1evlsPh0Ne//nWtXbtW7e3t2rRpU3xYezAY1Lp163TiiSfGwwtp/7y5lZWVPe6Imjp16qDe35FHHtnt76NGjep1+DwAAACQTIcccoiefPJJPfnkk7r77rs1adIkjR07Vrfffn
u3tkNJSYkefvhhHXnkkaqqqtKyZcv0hz/8QW+//bba29u7veaECRO6dYqXlJRIkkKhULfHLV68WBs3btQPf/hD5ef
"text/plain": [
"<Figure size 1600x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# seaborn (sns) and matplotlib.pyplot (plt) are already imported in the first code cell.\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"# (x column, y column) for each panel of the 2x2 grid, in subplot order.\n",
"feature_pairs = [\n",
"    ('Age', 'Networth'),\n",
"    ('Rank', 'Networth'),\n",
"    ('Age', 'Rank'),\n",
"    ('Networth', 'Rank'),\n",
"]\n",
"\n",
"plt.figure(figsize=(16, 12))\n",
"\n",
"# One scatter plot per pair instead of four copy-pasted stanzas.\n",
"for position, (x_col, y_col) in enumerate(feature_pairs, start=1):\n",
"    plt.subplot(2, 2, position)\n",
"    sns.scatterplot(x=df_cleaned[x_col], y=df_cleaned[y_col], alpha=0.6)\n",
"    plt.title(f'{x_col} vs {y_col}')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Стандартизация данных для кластеризации"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Стандартизация данных — процесс приведения всех признаков (столбцов) к одному масштабу."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Rank Networth Age Country_Algeria Country_Argentina \\\n",
"0 -1.738048 15.675541 -1.184396 -0.025828 -0.044766 \n",
"1 -1.735688 12.120704 -0.578567 -0.025828 -0.044766 \n",
"2 -1.733328 11.157936 0.557363 -0.025828 -0.044766 \n",
"3 -1.730968 9.010221 0.027262 -0.025828 -0.044766 \n",
"4 -1.728608 8.195571 1.920479 -0.025828 -0.044766 \n",
"\n",
" Country_Australia Country_Austria Country_Belgium Country_Belize \\\n",
"0 -0.147643 -0.068473 -0.025828 -0.025828 \n",
"1 -0.147643 -0.068473 -0.025828 -0.025828 \n",
"2 -0.147643 -0.068473 -0.025828 -0.025828 \n",
"3 -0.147643 -0.068473 -0.025828 -0.025828 \n",
"4 -0.147643 -0.068473 -0.025828 -0.025828 \n",
"\n",
" Country_Brazil ... Industry_Logistics Industry_Manufacturing \\\n",
"0 -0.124788 ... -0.119159 -0.344337 \n",
"1 -0.124788 ... -0.119159 -0.344337 \n",
"2 -0.124788 ... -0.119159 -0.344337 \n",
"3 -0.124788 ... -0.119159 -0.344337 \n",
"4 -0.124788 ... -0.119159 -0.344337 \n",
"\n",
" Industry_Media & Entertainment Industry_Metals & Mining \\\n",
"0 -0.193247 -0.175863 \n",
"1 -0.193247 -0.175863 \n",
"2 -0.193247 -0.175863 \n",
"3 -0.193247 -0.175863 \n",
"4 -0.193247 -0.175863 \n",
"\n",
" Industry_Real Estate Industry_Service Industry_Sports \\\n",
"0 -0.281312 -0.137919 -0.107067 \n",
"1 -0.281312 -0.137919 -0.107067 \n",
"2 -0.281312 -0.137919 -0.107067 \n",
"3 -0.281312 -0.137919 -0.107067 \n",
"4 -0.281312 -0.137919 -0.107067 \n",
"\n",
" Industry_Technology Industry_Telecom Industry_diversified \n",
"0 -0.387694 -0.113266 -0.265827 \n",
"1 2.579353 -0.113266 -0.265827 \n",
"2 -0.387694 -0.113266 -0.265827 \n",
"3 2.579353 -0.113266 -0.265827 \n",
"4 -0.387694 -0.113266 -0.265827 \n",
"\n",
"[5 rows x 85 columns]\n"
]
}
],
"source": [
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"\n",
"# Encode only while the raw categorical columns are still present.\n",
"# df_cleaned is overwritten below, so re-running this cell takes the else branch.\n",
"if 'Country' in df_cleaned.columns and 'Industry' in df_cleaned.columns:\n",
"    # One-hot encode Country. The encoded frame reuses df_cleaned's index:\n",
"    # dropna() can leave gaps in it, and a fresh 0..n-1 index would make the\n",
"    # column-wise concat below misalign rows and introduce NaN values.\n",
"    encoder_country = OneHotEncoder(sparse_output=False)\n",
"    country_encoded = encoder_country.fit_transform(df_cleaned[['Country']])\n",
"    country_encoded_df = pd.DataFrame(\n",
"        country_encoded,\n",
"        columns=encoder_country.get_feature_names_out(['Country']),\n",
"        index=df_cleaned.index,\n",
"    )\n",
"\n",
"    # One-hot encode Industry the same way.\n",
"    encoder_industry = OneHotEncoder(sparse_output=False)\n",
"    industry_encoded = encoder_industry.fit_transform(df_cleaned[['Industry']])\n",
"    industry_encoded_df = pd.DataFrame(\n",
"        industry_encoded,\n",
"        columns=encoder_industry.get_feature_names_out(['Industry']),\n",
"        index=df_cleaned.index,\n",
"    )\n",
"\n",
"    # Replace the original Country and Industry columns with the encoded ones.\n",
"    df_cleaned = df_cleaned.drop(columns=['Country', 'Industry'])\n",
"    df_cleaned = pd.concat([df_cleaned, country_encoded_df, industry_encoded_df], axis=1)\n",
"\n",
"    # Standardize every column of the encoded frame.\n",
"    scaler = StandardScaler()\n",
"    data_scaled = scaler.fit_transform(df_cleaned)\n",
"\n",
"    # Wrap the scaled array in a DataFrame with the same row and column labels.\n",
"    df_scaled = pd.DataFrame(data_scaled, columns=df_cleaned.columns, index=df_cleaned.index)\n",
"\n",
"    # Show the first rows of the standardized frame.\n",
"    print(df_scaled.head())\n",
"else:\n",
"    print(\"Столбцы 'Country' и/или 'Industry' отсутствуют в DataFrame.\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Агломеративная (иерархическая) кластеризация"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Иерархическая кластеризация — метод машинного обучения, предназначенный для группировки объектов (точек данных) на основе их схожести или расстояния друг от друга. Основная идея заключается в создании структуры кластеров в виде дерева (дендрограммы), которое показывает, как объекты группируются на разных уровнях."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1AAAAJ0CAYAAAAcUcKlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADNs0lEQVR4nOzdd5xU1f3/8feUXZaFpfeiUgQVBJGmGJGmMQkawXyNsWv8ahRLLD81MRqNGv0moAFbJMYCdoWAvYCAKIgBFOyoWGjLLlvYpexOO78/1pmdmZ2dvdNndl/Px4MHd+69c+/ZO3fu3M8953yOzRhjBAAAAABokj3TBQAAAACAXEEABQAAAAAWEUABAAAAgEUEUAAAAABgEQEUAAAAAFhEAAUAAAAAFhFAAQAAAIBFBFAAAAAAYBEBFAAAAABYRAAFwLIbbrhBgwcPjvjvhhtuyHTxAATZvXu3Ro4cqY0bN2r37t265JJL9Mgjj2S6WACQ85yZLgCA3NK1a1fdd999IfMuu+yyDJUGQGPat2+v888/X6eddpqMMRo8eLD+7//+L9PFAoCcRwAFwDKv16vCwkIdccQRIfPz8/MzUyAAUV122WU6/fTTVVVVpQMPPFAOhyPTRQKAnEcTPgCWeTweFRQUWFp37dq1OuusszR8+HCNGTNG119/vcrLywPLFy5cqMGDB2vr1q0h75s0aVJIc0C3291os8HwbW3YsEHTpk3TsGHDdNJJJ+n1118P2XZ1dbXuvPNOTZkyRYcffrimTp2qF154ocH+w/ezdetWnX322brhhhv0z3/+U+PGjdPIkSN16aWXatu2bSHvX7Jkic444wyNGDFCQ4cO1Yknnqgnn3wysHzNmjWB7a5bty7kvU888YQGDx6sSZMmNSjPn/70p5B1d+/eraFDh2rw4MFas2aN5f035vnnn9f06dN1xBFHaNiwYfrlL3+p1157rcExjtRss7HP5+yzzw7Zx6uvvqrp06drxIgROuaYY3TzzTdr9+7dgeX33nuvBg8erBEjRsjlcoW894orrmjQVLS2tlZ/+9vfdNxxx2no0KE66aST9Oqrr4a8b9KkSbrnnnv017/+VaNHj9bYsWN13XXXqbKy0vLfH63p6sKFCwOfafDnUFZWplGjRkX8LAcPHqxDDjlEo0eP1uWXX66KiorAOoMHD9a9994bUjb/cYnnWEpSly5d1L9/f61atarJ5rbh+3rllVc0evRozZo1S1Lo+Rv+L7jcX3zxhS677DIdddRRGjJkiI499ljdfvvtqqmpCazjcrn0j3/8Q5MnT9awYcM0depU/ec//7F0zCVp+/btuvrqqzVmzBgNHz5c5557rj777LPA9rdu3arBgwfrlVde0e9+9zsNHz5cEyZM0P333y+fzxfyuYQfk6uvvjrkMzXGaPbs2Tr22GM1cuRI/e53v9OOHTsC63u9Xs2dO1dTp07VsGHDdMQRR+j000/X+++/H/VzlBp+5uGvjTE6/fTTQ66XN9xwQ8i5JUnPPPNMxPMHQPJRAwXAsv3796t9+/ZNrvff//5X559/vo466ij94x//0O7duzV79mydc845euGFFywHYVLdTbIkPfjgg+rUqZOkupvd8MBHki6++GKdddZZuuqqq/TCCy/o97//vR566CEdd9xxqqmp0RlnnKGysjJdccUV6t27t5YsWaIbb7xRu3bt0u9+97vAdo477jhdeumlgdfdunWTJC1dulQdO3bUn/70J/l8Ps2aNUtnn322XnnlFbVu3VrLly/XjBkzdM455+jyyy9XTU2NnnrqKf3lL3/R0KFDNXz48MA227Rpo7ffflsjR44MzHv11Vdltzd8rtWmTRstX75cxhjZbDZJ0ptvvimv1xuyXiz7D/bkk0/q9ttv1+WXX66RI0dq9+7d+te//qVrr71WI0aMUI8ePQLr3nffferataskBT4PSfrVr36l//mf/wm8vvXWW0P28cADD2
jOnDk644wzdNVVV2nLli2aPXu2PvroIz333HMh54TNZtPq1at13HHHSZL27t2rFStWhBwbY4xmzJih9evX64orrtCAAQP01ltv6aqrrpLL5dIpp5wSWPepp57SgQceqDvvvFPl5eWaNWuWvv/+ez3zzDOy2WxN/v2XXnqpTj/9dEl1NTqHHXZY4Pw44IAD9NVXXzU4prNmzVJ1dbXatWsXMt9/brndbn3zzTf629/+pjvuuEMzZ86M+NlEEsux9HO73frrX/9qeR+SVFNTo7/85S+68MILddJJJ4Usu/nmmzVkyJDA61//+teB6ZKSEp155pk64ogjdNdddyk/P1/vvPOOHn30UXXr1k0XXXSRJOnaa6/VihUrdMkll2j48OFasWKFbrjhBuXl5TV5zMvLy3X66aerdevWuummm9S6dWs9/vjjOvPMM/XCCy9owIABgfLccsstOu6443Tvvfdq3bp1uu+++7Rv3z79v//3/yL+3WvXrtUrr7wSMu+xxx7TQw89pOuuu079+vXTXXfdpSuvvFLPPfecJGnmzJl6+umndc0112jw4MHauXOn7r//fl155ZVavny5WrduHdOxD7Z48WJ9+OGHUdfZvXu3/vGPf8S9DwCxIYACYFllZWUgmIhm1qxZ6tevnx566KFAk6Hhw4frF7/4hRYsWKAzzzzT8j737dsnSRoxYoQ6duwoSVq5cmXEdc8++2zNmDFDknTsscdq2rRpuv/++3Xcccdp4cKF2rRpk5555hmNGDEisI7H49EDDzyg008/XR06dJBUFxiEN1OU6gLIhQsXqm/fvpKk/v37a9q0aVq0aJF+85vf6Ouvv9a0adN04403Bt4zYsQIjR07VmvWrAkJYMaPH6+lS5cGbuKKi4v14YcfatSoUQ1qtY4++mitWLFCGzZsCJTrtdde0+jRo0NqPWLZf7AtW7bot7/9bUjQ2Lt3b02fPl3r1q3TL37xi8D8Qw89VH369GmwjR49eoQcs7Zt2wamd+/erQcffFCnnXaabr755sD8QYMG6cwzz2xwTviPjT+Aevvtt9W1a9eQWoNVq1Zp5cqVuueee/Tzn/9cUt3nuX//fs2cOVNTp06V01n3E2e32/Xoo4+qqKhIUt3nO2PGDK1cuVLjx4+39PcfcMABkuqaqzZ2fvh9/PHHWrx4sQ499FBVVVWFLAt+7+jRo7Vq1Sp9+umnjW4rXKzH0m/+/Pnat2+funTpYnlfL7/8svLy8nThhRc2aPo3cODARo/Bpk2bdOihh2r27NmB82DcuHF67733tGbNGl100UXatGmT3njjDf3xj3/UueeeK6nuPN+2bZvWrFmjqVOnRj3m99xzjyorK/X000+rd+/ekurOm5///OeaPXu25syZE1h3yJAhgQB1/Pjx2rdvnx5//HFdcsklIeepJPl8Pt1+++0aMmRIyOeyb98+XXrppTrvvPMk1dVu/eUvf1FVVZXatWunkpISXXXVVSG1rq1atdLll1+uL7/8Mur5Es3evXs1c+bMBuUJN2fOHPXq1SukNhNA6tCED4BlJSUl6t69e9R19u/frw0bNui4446TMUYej0cej0d9+/bVgAED9N5774Ws7/P5Aut4PJ4G2ysuLpbdbm9woxPJtGnTAtM2m03HH3+8Nm7cqJqaGn3wwQfq3bt3IHjyO/nkk1VbW6sNGzY0uf0jjzwyEDxJ0mGHHaa+ffvqv//9ryTpwgsv1F133aW9e/fqk08+0auvvqqHHnpIkho0SZs0aZK+++47bd68WZL0+uuva/jw4YGbwWBFRUUaM2aMli5dKkkqLy/XmjVrQgKbWPcf7IYbbtC1116rqqoqffTRR1q8eHGg2V+091n10UcfyeVyaerUqSHzR40apd69e+uDDz4ImT958mS9/fbbMsZIqquZ8wdJfqtXr5bNZtNxxx0Xcv5MmjRJpaWlIbVCkyZNCgRP/tdOpzPwuSXz7zfG6Pbbb9evfvUrHXLIIRGXez
weuVwubdy4UevWrdPQoUND1gn/TgQHjrEeS0natWuX7r//fl1//fVq1aqVpb9j586d+te//qUzzjgj5n5TP/nJT/T
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2 2 2 ... 4 4 2]\n"
]
}
],
"source": [
"# Build the Ward-linkage hierarchy from the standardized feature matrix.\n",
"linkage_matrix = linkage(data_scaled, method='ward')\n",
"\n",
"# Draw the dendrogram and label it through the current Axes.\n",
"plt.figure(figsize=(10, 7))\n",
"dendrogram(linkage_matrix)\n",
"dendrogram_axes = plt.gca()\n",
"dendrogram_axes.set_title('Дендрограмма агломеративной кластеризации')\n",
"dendrogram_axes.set_xlabel('Индекс образца')\n",
"dendrogram_axes.set_ylabel('Расстояние')\n",
"plt.show()\n",
"\n",
"# Flat cluster labels from the hierarchy, using the 'distance' criterion\n",
"# with the threshold below.\n",
"distance_threshold = 56\n",
"result = fcluster(linkage_matrix, t=distance_threshold, criterion='distance')\n",
"print(result)  # one cluster label per sample"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Визуализация распределения кластеров"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3ib5fX/8Y/2sOU9s/eCbLIombSUEWgDpYUvUKCsUEbLLJS9oQUKPwK0lDALZYWywh5lhAwSkpBJ9o73tmTt3x9pDI7tQGI9kmy/X9eVK/FzS7qPTmxLR+e5n9sUjUajAgAAAAAAAAAASCLmRAcAAAAAAAAAAACwLxoYAAAAAAAAAAAg6dDAAAAAAAAAAAAASYcGBgAAAAAAAAAASDo0MAAAAAAAAAAAQNKhgQEAAAAAAAAAAJIODQwAAAAAAAAAAJB0aGAAAAAAAAAAAICkQwMDANApRaPRTjk3AAAAAOMl+j1/oucHgFihgQEAHcwVV1yhgQMH6oknnkh0KG22cOFCDRw4UL///e9bHH/11Vc1cOBA7dix44Ae95FHHtHs2bNjEeIBW79+vU499dQmxwYOHKiHHnooIfEAAACg8zrjjDM0cODAJn8GDRqkUaNG6cQTT9Trr79uyLwPPfSQBg4caMhjH4gdO3Zo4MCBOvHEExUKhZqN761HFi5ceECP+/LLL+uee+6JVZgHpKioSOeff7527tzZeGzatGm65pprEhIPALQVDQwA6EBqa2v14YcfasCAAXrxxRc7zFk3H330kd54442YPd6DDz4on88Xs8c7EO+++66WLl2akLkBAACAfQ0ZMkQvvvhi45/nnntOt912mywWi66++mp9+umniQ7RcKtWrdI///nPmD3eo48+qqqqqpg93oH48ssvO8X/GYDOgwYGAHQgb731liTpuuuu05YtW7RgwYIERxQbaWlpuuOOO1RWVpboUAAAAIAOJTU1VSNGjGj8M3r0aB133HF64oknZLPZ9OqrryY6RMOlpaXp4Ycf1vr16xMdCgBgHzQwAKADmTNnjiZMmKDx48erZ8+eeuGFF5rdZvbs2TryyCM1bNgwnXLKKfr444+bLYtet26dLrjgAo0aNUqjRo3SRRddpO3bt7c675tvvqmBAwdq3bp1TY5/+OGHGjhwoFavXi1Jevrpp3X00Udr6NChmjhxom6++WbV1dX94PO67LLL5PV6dfPNN//gbXft2qXLL79cY8eO1fDhw3XmmWc2zi+pcan6rFmzNHDgQD3zzDMaNGiQKisrG2/z8MMPa+DAgZo/f36T5zJo0CAVFxdLklasWKFzzjlH48aN06hRozRz5swmBc/e5eYvvPCCpk6dqlGjRumUU07RrFmzGuP4/mWj6urqdN1112ns2LEaOXKkLr30Uho2AAAASBiHwyG73S6TydR4rKKiQrfccoumTp2qQw89VGPHjtVFF13U5JKuZ5xxhq677jo99thjmjJlioYOHapTTjlF33zzTatz7dq1S1OmTNGJJ56ompqaZuNff/21Bg4cqE8++aTJ8TVr1mjgwIH64IMPJO05oeuEE07QsGHDNH78eF155ZWN79/354ILLlBqaqquueYahcPh/d62qqpKN954ow4//HANHTpUv/71r5vUDdOmTdPOnTv1n//8p7He+H5NJEmvvfaaBg4cqJdffrnZc9m7WnvLli269NJL9ZOf/EQjRozQGWecoSVLljTefu/lr5588kkdffTRGj58uObMmaNrr71WknTkkUc2uWxUMBjUX/7yl8bH+93vfqetW7f+YG4AINFoYABAB7F+/XqtWLFCv/zlLyVJv/zlL/XRRx81+RB81qxZuvfee3XMMcfokUce0fDhw/XHP/6xyeNs3rxZp5xyisrLy3XPPffojjvu0Pbt23XqqaeqvLy8xbl/+tOfyu12a+7cuU2Ov/XWW+rfv7+GDBmit956S3/961912mmnafbs2brooo
v0+uuv67bbbvvB59a3b19dcskl+uCDDxpXmbSkoqJCp5xyilatWqUbbrhB9913nyKRiE477TRt3LhRkvTiiy9Kkn71q1/pxRdf1JQpUxSNRpusVtn776+++qrx2GeffaYhQ4YoPz9fCxYsaNzH4s4779Ttt9+u3bt365RTTmmcZ69Zs2bpT3/6k2688Ubdd999+tWvftUYx8knn9x4u2eeeUbBYFAPPvigrrjiCn388ce69dZbfzA3AAAAQFtEo1GFQqHGP36/X5s2bdK1116r+vp6/eIXv2i83QUXXKB58+bpyiuv1OzZs3XxxRdr/vz5uummm5o85nvvvaePPvpI119/ve6//36VlZXpkksuabE5UFpaqrPOOksZGRl68sknlZaW1uw2o0aNUo8ePVqsNzIyMjR58mQtWbJEV199tY466ij985//1LXXXqsFCxboiiuu+MEcZGVl6cYbb9TKlSv1+OOPt3o7v9+vM888Ux999JEuu+wyzZo1SwUFBTr33HMbmxizZs1Sbm6uJk+erBdffFEzZsyQ3W7Xl19+2fg4e+uNxYsXNx777LPPlJWVpeHDh2vDhg068cQTtWPHDl1//fW69957ZTKZdOaZZ2rRokVNYnrooYd03nnn6S9/+YsOP/xwXXjhhY1xfH8vwbffflvr16/X3XffrZtuukkrV67UZZdd9oO5AYBEsyY6AABAbMyZM0cZGRmaNm2aJGnGjBl66KGH9Morr2jmzJnyer365z//qdNOO01XXnmlJOmII46Qz+dr/FBf2vNG1+Vy6amnnlJqaqokacKECfrpT3+qxx9/XH/605+aze1yufTzn/9cb7/9duOb4Pr6en3yySe66KKLJEmLFi1St27ddNppp8lsNmvs2LFyu92qrq7+Uc/vnHPO0QcffKDbbrtN48ePV05OTrPbPP3006qqqtK///1vde3aVZI0adIkHXvssXrwwQf1//7f/9OIESMkSQUFBY3/7t27t+bPn69jjjlGPp9PS5cu1SGHHNKkgfH555/rxBNPlCTdd9996tmzpx577DFZLJbGXP7sZz/T//t//08PPvhg4/3+7//+T0cffXTj1wUFBZLUOPdeQ4cO1V/+8pfGfC9fvpxr1wIAAMBwX331lQ455JAmx0wmkwYMGKAHH3xQU6dOlSSVlJTI5XLpT3/6kw477DBJ0rhx47Rt27Ym9YQkhUIhzZ49u7GeqK+v15/+9CetWbNGhx56aOPtKisrdfbZZ8vpdOrJJ59Uenp6q3GecMIJeuKJJ9TQ0CCn06loNKq3335bRx99tOx2u5YsWSKn06nzzz9fdrtdkpSRkaEVK1YoGo02WUnSkmOPPVbvvPOOZs2apWnTpql///7NbvP6669r7dq1eumllzR8+HBJe+qNM844Q/fee6/mzJmjIUOGyG63Kysrq/E9/9ixYzV//nyde+65kqT58+e3WG9MnjxZZrNZs2bNkt1u1zPPPNOYwylTpmj69On6y1/+oldeeaXxfsccc4xOOumkxq979OghSRo8eLC6devWeDw/P1+PPPKIbDabJGnr1q169NFHVVdX1zgHACQjVmAAQAcQDAb1xhtv6Kc//akaGhpUU1OjlJQUjR49Wi+99JIikYiWLVumhoaGJh+mS9L06dObfL1gwQKNHTtWTqez8Sys1NRUHXbYYU3OGtrXL37xC23btq1xafhHH32kQCCgE044QZI0fvx4bd68WSeeeKJmzZqlFStW6Pjjj9cZZ5zxo56jxWLRXXfdJa/Xq1tuuaXF28yfP1+DBw9Wfn5+Y+xms1mTJk3ab+xTpkxpHF+yZIlsNpt++9vfavny5QoEAtqwYUPjsnav16sVK1bomGOOaWxeSHuumzt16tRmZ0QNHjz4Rz2/0aNHN/m6W7duLS6fBwAAAGLpkEMO0SuvvKJXXnlFjzzyiAYMGKBevXrpgQceaFI75Ofn65lnntHo0aO1Y8cOzZs3T88++6y+/vprBQ
KBJo/Zr1+/Jh+K5+fnS5J8Pl+T25177rlav369/vznPyszM3O/cZ5wwgnyer2Nl5H6+uuvtWvXrsYVImPGjJHP59P
"text/plain": [
"<Figure size 1600x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Four pairwise projections of Age, Networth and Rank, colored by the\n",
"# hierarchical cluster labels stored in `result`.\n",
"panels = (('Age', 'Networth'), ('Rank', 'Networth'), ('Age', 'Rank'), ('Networth', 'Rank'))\n",
"\n",
"plt.figure(figsize=(16, 12))\n",
"\n",
"for slot, (x_name, y_name) in enumerate(panels, start=1):\n",
"    plt.subplot(2, 2, slot)\n",
"    sns.scatterplot(\n",
"        x=df_cleaned[x_name],\n",
"        y=df_cleaned[y_name],\n",
"        hue=result,\n",
"        palette='Set1',\n",
"        alpha=0.6,\n",
"    )\n",
"    plt.title(f'{x_name} vs {y_name}')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# KMeans (неиерархическая кластеризация) для сравнения"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Неиерархическая кластеризация — метод группировки данных, при котором объекты распределяются по заданному числу кластеров (в нашем случае — 𝑘 в методе K-Means), основываясь на определенных метриках расстояния или схожести. В отличие от иерархической кластеризации, которая создает древовидную структуру кластеров, неиерархическая работает с фиксированным количеством кластеров и напрямую распределяет объекты в группы.\n",
"\n",
"K-Means:\n",
"* Один из самых популярных методов.\n",
"* Делит данные на 𝑘 кластеров, минимизируя сумму квадратов расстояний от каждой точки до её центроида.\n",
"* Центроиды обновляются итеративно, пока результат не стабилизируется."
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Центры кластеров:\n",
" [[ 7.20926316e+02 6.21684211e+00 6.75894737e+01 7.58941521e-19\n",
" 8.67361738e-19 0.00000000e+00 4.33680869e-18 1.05263158e-02\n",
" 1.05263158e-02 -1.38777878e-17 2.42861287e-17 5.26315789e-02\n",
" 1.11022302e-16 4.21052632e-02 -1.30104261e-18 -2.60208521e-18\n",
" -1.73472348e-18 2.16840434e-18 1.08420217e-19 -5.42101086e-19\n",
" 3.46944695e-18 1.05263158e-02 7.78947368e-01 -4.33680869e-19\n",
" 1.05263158e-02 -3.46944695e-18 -1.08420217e-19 2.77555756e-17\n",
" 1.73472348e-18 5.20417043e-18 -8.67361738e-18 -1.38777878e-17\n",
" -1.38777878e-17 4.21052632e-02 4.33680869e-19 3.25260652e-19\n",
" 2.60208521e-18 7.80625564e-18 4.33680869e-19 5.20417043e-18\n",
" 2.10526316e-02 -1.73472348e-18 -4.33680869e-18 0.00000000e+00\n",
" 1.05263158e-02 3.46944695e-18 0.00000000e+00 -2.16840434e-19\n",
" 4.33680869e-19 -5.42101086e-19 -6.93889390e-18 1.73472348e-18\n",
" 1.30104261e-18 -8.67361738e-18 0.00000000e+00 6.93889390e-18\n",
" 1.04083409e-17 -1.04083409e-17 1.73472348e-18 1.73472348e-18\n",
" -5.42101086e-19 2.16840434e-18 6.93889390e-18 -2.22044605e-16\n",
" 1.05263158e-02 -2.16840434e-18 -1.08420217e-19 1.04083409e-17\n",
" 1.05263158e-02 1.05263158e-02 2.31578947e-01 2.00000000e-01\n",
" 5.26315789e-02 1.05263158e-02 1.36842105e-01 1.05263158e-02\n",
" 1.05263158e-01 5.26315789e-02 4.21052632e-02 3.15789474e-02\n",
" 1.05263158e-02 -8.67361738e-18 8.42105263e-02 1.73472348e-18\n",
" 1.05263158e-02]\n",
" [ 7.01340909e+02 1.23954545e+01 6.47045455e+01 1.08420217e-19\n",
" -4.33680869e-19 2.27272727e-02 -1.73472348e-18 1.08420217e-19\n",
" 0.00000000e+00 0.00000000e+00 1.04083409e-17 4.33680869e-19\n",
" 2.72727273e-01 1.30104261e-18 -6.50521303e-19 -8.67361738e-19\n",
" -1.73472348e-18 1.30104261e-18 -2.16840434e-19 0.00000000e+00\n",
" 0.00000000e+00 0.00000000e+00 9.09090909e-02 4.33680869e-19\n",
" -2.16840434e-19 9.09090909e-02 3.25260652e-19 1.59090909e-01\n",
" 0.00000000e+00 -2.60208521e-18 2.27272727e-02 4.54545455e-02\n",
" -1.73472348e-18 0.00000000e+00 -8.67361738e-19 -2.16840434e-19\n",
" 8.67361738e-19 0.00000000e+00 2.16840434e-19 1.73472348e-18\n",
" -6.50521303e-19 -4.33680869e-19 0.00000000e+00 4.33680869e-19\n",
" 0.00000000e+00 -1.73472348e-18 0.00000000e+00 -1.08420217e-19\n",
" 2.16840434e-19 0.00000000e+00 1.04083409e-17 0.00000000e+00\n",
" -8.67361738e-19 2.27272727e-02 -8.67361738e-19 -1.73472348e-18\n",
" 2.27272727e-02 3.46944695e-18 0.00000000e+00 1.30104261e-18\n",
" 0.00000000e+00 2.27272727e-02 -1.38777878e-17 2.27272727e-01\n",
" 0.00000000e+00 8.67361738e-19 4.33680869e-19 1.00000000e+00\n",
" -1.73472348e-18 0.00000000e+00 4.16333634e-17 -2.77555756e-17\n",
" 0.00000000e+00 1.73472348e-18 2.77555756e-17 3.46944695e-18\n",
" -5.55111512e-17 -6.93889390e-18 1.04083409e-17 0.00000000e+00\n",
" 1.04083409e-17 1.73472348e-18 0.00000000e+00 1.73472348e-18\n",
" 2.77555756e-17]\n",
" [ 7.33294118e+02 4.89411765e+00 7.17647059e+01 -1.08420217e-19\n",
" 4.33680869e-19 0.00000000e+00 0.00000000e+00 0.00000000e+00\n",
" -1.08420217e-19 0.00000000e+00 0.00000000e+00 4.33680869e-19\n",
" -2.77555756e-17 0.00000000e+00 2.16840434e-19 2.60208521e-18\n",
" 0.00000000e+00 0.00000000e+00 2.16840434e-19 -1.08420217e-19\n",
" 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.33680869e-19\n",
" -1.08420217e-19 3.46944695e-18 -1.08420217e-19 6.93889390e-18\n",
" -1.73472348e-18 8.67361738e-19 1.73472348e-18 3.46944695e-18\n",
" -1.73472348e-18 4.33680869e-19 0.00000000e+00 -1.08420217e-19\n",
" -1.73472348e-18 0.00000000e+00 -1.08420217e-19 8.67361738e-19\n",
" 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00\n",
" 1.08420217e-19 1.73472348e-18 8.67361738e-19 1.08420217e-19\n",
" -1.08420217e-19 1.08420217e-19 0.00000000e+00 -1.73472348e-18\n",
" -4.33680869e-19 1.73472348e-18 -8.67361738e-19 -1.73472348e-18\n",
" 0.00000000e+00 -3.46944695e-18 0.00000000e+00 -4.33680869e-19\n",
" -1.08420217e-19 -4.33680869e-19 5.88235294e-02 9.41176471e-01\n",
" -1.08420217e-19 -4.33680869e-19 0.00000000e+00 -3.46944695e-18\n",
" -1.73472348e-18 -6.93889390e-18 0.00000000e+00 -2.77555756e-17\n",
" 0.00000000e+00 0.00000000e+00 0.00000000e+00 -1.73472348e-18\n",
" 0.00000000e+00 -6.93889390e-18 0.00000000e+00 0.00000000e+00\n",
" 0.00000000e+00 1.00000000e+00 2.77555756e-17 1.73472348e-18\n",
" 0.00000000e+00]\n",
" [ 7.39883185e+02 7.28195685e+00 6.54553571e+01 7.44047619e-04\n",
" 2.23214286e-03 2.30654762e-02 5.20833333e-03 -3.25260652e-18\n",
" -4.66206934e-18 1.71130952e-02 2.82738095e-02 3.20923843e-17\n",
" 1.71875000e-01 -3.20923843e-17 1.48809524e-03 5.20833333e-03\n",
" 4.46428571e-03 2.97619048e-03 7.44047619e-04 7.44047619e-04\n",
" 2.45535714e-02 -3.68628739e-18 4.02455846e-16 2.23214286e-03\n",
" -2.92734587e-18 3.12500000e-02 7.44047619e-04 6.17559524e-02\n",
" 1.11607143e-02 6.69642857e-03 1.19047619e-02 1.78571429e-02\n",
" 1.41369048e-02 -3.33934269e-17 2.23214286e-03 7.44047619e-04\n",
" 5.95238095e-03 7.44047619e-03 7.44047619e-04 4.46428571e-03\n",
" 1.66967135e-17 2.23214286e-03 5.20833333e-03 7.44047619e-04\n",
" -3.36102673e-18 6.69642857e-03 2.97619048e-03 7.44047619e-04\n",
" 7.44047619e-04 7.44047619e-04 2.75297619e-02 1.19047619e-02\n",
" 2.97619048e-03 1.19047619e-02 8.18452381e-03 1.63690476e-02\n",
" 2.38095238e-02 2.00892857e-02 8.92857143e-03 3.72023810e-03\n",
" 7.44047619e-04 1.48809524e-03 2.30654762e-02 3.46726190e-01\n",
" -4.66206934e-18 3.72023810e-03 7.44047619e-04 1.38777878e-16\n",
" 1.26488095e-02 4.31547619e-02 9.52380952e-02 1.59970238e-01\n",
" 8.70535714e-02 1.11607143e-02 7.14285714e-02 1.48809524e-02\n",
" 1.10863095e-01 3.64583333e-02 3.05059524e-02 7.96130952e-02\n",
" 2.00892857e-02 -3.81639165e-17 1.39880952e-01 1.41369048e-02\n",
" 7.29166667e-02]]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hb5fn/8Y/2sLx3nOHE2SSEBMgAkpBAIYzSAoXSL7M/wiqU1RL2TigtCVAICRCgpGW1jDLKLKOsbwZhNWTvbTuOty3LWr8/8rVA2M6yjiTb79d1cRGf5+g8t2/Zlm7d5znHFA6HwwIAAAAAAAAAAEgi5kQHAAAAAAAAAAAA8GM0MAAAAAAAAAAAQNKhgQEAAAAAAAAAAJIODQwAAAAAAAAAAJB0aGAAAAAAAAAAAICkQwMDAAAAAAAAAAAkHRoYAAAAAAAAAAAg6dDAAAAAAAAAAAAASYcGBgBgn4TD4USHgARJ5HPPzx0AAEDnxvs57A31BoA9oYEBIGmde+65Ovfcc1ttr6+v15lnnqlhw4bp/fffj+w7aNAgnXXWWe0e75prrtGgQYN0ww03GBazUXw+n55++mmdfvrpOvTQQzV69GidddZZevXVV6PecD388MMaNGhQTOdubm7WPffcozfeeCMmx2vveY2nUCiko48+WoMGDdJ3332X0FhipeW5f/rpp9scv+GGGzR58uT9OmZtba2mTZumJUuWxCDC/ffBBx/o+uuvj3y9aNEiDRo0SIsWLUpIPAAAoHOgjvgedURsbN26VYMGDWr137Bhw3TkkUfqiiuu0IYNGwyZe/LkyUnxs0e9ASBRrIkOAAD2R319vaZOnaqVK1fqkUce0cSJEyNjZrNZ33zzjUpLS1VQUBD1uMbGRn300UfxDjcmKioqNHXqVO3YsUPnnnuuDj74YIVCIX300Ue64YYbtGTJEt19990ymUyGzF9eXq758+frD3/4Q0yOd/vtt8fkOB3x+eefq6KiQv369dMLL7yg6dOnJzqkmHnggQc0adIk9enTp8PHWrFihV577TWdfvrpMYhs/7VXHAEAAOwv6gjqiFi47LLLdPTRR0e+9nq9WrZsmR599FH9v//3//TOO+/I4XAkLsA4oN4AEG80MAB0Gi1Fx4oVKzR37lwdeeSRUeNDhw7V2rVr9c477+iCCy6IGvvoo4/kcrmUlpYWx4hj4/rrr1dpaan+/ve/q7i4OLL96KOPVo8ePXT//fdr0qRJOuaYYxIX5H7o379/okPQK6+8opEjR2r8+PGaO3eubrjhBnk8nkSHFRN2u1033XSTnnnmGcOKUQAAgM6EOoI6IlZ69+6tQw45JGrbuHHjlJKSojvuuEMLFy6Mao51RdQbAOKNS0gB6BQaGhp00UUXadWqVXr88cdbFR2S5Ha7NXHiRL3zzjutxt566y0df/zxslqj+7ahUEiPP/64fvKTn2jYsGE6/vjj9be//S1qn2AwqMcff1wnn3yyDj74YB1yyCE666yztHDhwsg+Dz/8sH7yk5/oP//5j376059GjvXqq69GHWv+/PmaMmWKhg8frvHjx+uOO+5QfX19u9/3ihUr9Nlnn+nCCy+MKjpaXHDBBTr77LPldrvbfHxby41feeUVDRo0SFu3bpUkNTU16Y477tCECRM0bNgwTZkyRU8++aSk3UulWwqaG2+8MWpJ8JIlS3TOOedoxIgRGj16tK6//npVVlZGzTN06FC9+OKLOvLIIzV69GitXbu21dLvQYMG6dlnn9XNN9+s0aNHa+TIkbrqqqtUUVERFfeTTz6pY445RgcffLDOOussffjhh1HLe1uWdT/88MPt5lOSampq9P7772vSpEk6+eST5fV69dprr7Xar76+XrfddpvGjRunkSNH6pprrtHTTz/damn9+++/r9NOO03Dhw/XkUceqenTp6uxsbHd+W+99VYdeeSRCgaDUdtnzJihMWPGyO/37/E52ZuWs+
n++te/7nXfPT2HixYt0nnnnSdJOu+883Tuuefqiiuu0CmnnBJ1jPPPP1/Dhg1TU1NT1Pdy/PHHR75+6623dNppp2nkyJE68sgjddttt6mmpiYy3vL7M3v2bI0ePVpHHXWUTjnlFC1evFiLFy9utYx7/fr1uvDCCzVixAgdeeSRmjlzpgKBwD7lBwAAdC/UEdQRsaoj9qStBtcXX3yhCy+8UIcffriGDRumyZMn6+GHH1YoFIqa9+2339aVV16pkSNHavTo0brlllv2WE+89NJLGjx4sB555JE2x6k3qDeAroYGBoCk19jYqIsvvljLly/XvHnzNGbMmHb3PfHEEyPLv1vU19frk08+0cknn9xq/zvuuEMPPfSQTjnlFD366KOaMmWK7rnnnqg3gzNnztScOXP0y1/+Uk888YTuvvtuVVdX66qrrpLX643st3PnTt11110677zz9Pjjj6tnz566/vrrtW7dOknSv/71L9133306++yz9eSTT+ryyy/Xa6+9prvvvrvd7+fTTz+VpHavJepwOCIfsh+oe+65R5988omuv/76yJv7P/3pT3r55ZeVl5en2bNnS9q9XLrl31988YUuuOACOZ1OPfjgg7rpppu0ePFinXfeeVFvLIPBoJ566inNmDFDN954o0pKStqM4YEHHlAoFNL999+vadOm6aOPPtI999wTGZ89e7ZmzpypE044QXPmzNGIESN09dVXRx0jLy9Pf//733XGGWfs8ft94403FAwG9dOf/lQ9evTQ2LFj9fe//73Vfr/5zW/09ttv67e//a0eeOABNTQ0aNasWa2Odfnll6tfv3565JFHdMUVV+j111/Xb37zm3ZvBvezn/1MFRUVUW+QQ6GQ3n77bZ100kmy2Wx7fE725vTTT9eECRP0wAMPaPPmze3ut7fn8KCDDtJtt90mSbrtttt0++23a+LEiVq9erV27dolafc1lb/++mv5/X598803kWN/8sknmjRpkiRpzpw5uvbaa3XIIYfooYce0uWXX653331X5557btTPyvbt2/Xxxx/rgQce0I033qj7779fQ4cO1dChQ/X3v/9dBx10UGTfP/zhDzr00EP16KOP6oQTTtC8efP0wgsv7DU3AACge6GOoI6IZR0h7X7fHggEIv/V19fr888/16xZs1RUVKTDDjtMkrRy5UpdcMEFysjI0AMPPKC5c+fqsMMO0+zZs/X2229HHfP2229XUVGR5syZowsvvFAvvfSS5s6d2+b8b731lm699Vb95je/0eWXX97mPtQb1BtAV8MlpAAktZai48svv4x8vSdHH320XC5X1PLvf//738rOztahhx4ate+GDRv0j3/8Q9dee60uvvhiSdJRRx0lk8mkxx57TP/zP/+jzMxMlZeX65prrok628fhcOi3v/2tVq1aFVlC7PV6NWPGjEgRUFxcrEmTJunjjz9WSUmJFi9erJ49e+rss8+W2WzW6NGj5Xa7o84M+bEdO3ZIknr27LnvSdtPixcv1pFHHqmTTjpJkjRmzBi53W5lZ2fLbrdryJAhknYvlx46dKgkadasWerbt68ee+wxWSwWSdKIESN00kkn6eWXX9bZZ58dOf6ll14adZ3YtgwcODDq2rj//e9/I2fANTY2at68eTr77LP1+9//XtLu58nr9UY1Hux2e6vl3G155ZVXNGHCBOXm5kqSTjvtNF133XX66quvNGrUKEnSggULtGjRIj388MM67rjjJEkTJkzQySefHCkkw+GwZs6cqfHjx2vmzJmR4xcXF+uCCy7Qxx9/3Ob3feihh6qoqEj/+te/dMQRR0jaffbRzp079bOf/UzSnp+TfXH33Xfr5JNP1k033aS//e1vbS7t3pfnsGWZfv/+/dW/f//ImWULFizQySefrK+++koWi0V9+/bVF198obFjx2rLli3auHGjJk2apJqaGs2dO1dnnnlmpDiRdj/fZ599dtTPSiAQ0PXXXx8p+iRFLuv14+f1vP
PO029+8xtJ0tixY/X+++9r4cKFOuecc/YpPwAAoOujjqCOiHUdIUk333yzbr755qhtbrdbRx55pK6//nqlpKRI2t3
"text/plain": [
"<Figure size 1600x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Параметры для KMeans\n",
"random_state = 17\n",
"n_clusters = 4\n",
"\n",
"# Применение KMeans\n",
"kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)\n",
"labels = kmeans.fit_predict(data_scaled)\n",
"centers = kmeans.cluster_centers_\n",
"\n",
"# Обратная стандартизация центроидов\n",
"centers = scaler.inverse_transform(centers)\n",
"print(\"Центры кластеров:\\n\", centers)\n",
"\n",
"# Добавление меток кластеров в DataFrame для удобства\n",
"df_cleaned['Cluster'] = labels\n",
"\n",
"# Визуализация результатов кластеризации KMeans\n",
"plt.figure(figsize=(16, 12))\n",
"\n",
"plt.subplot(2, 2, 1)\n",
"sns.scatterplot(x=df_cleaned['Age'], y=df_cleaned['Networth'], hue=df_cleaned['Cluster'], palette='Set1', alpha=0.6)\n",
"plt.scatter(centers[:, 2], centers[:, 1], s=300, c='red', label='Centroids')\n",
"plt.title('KMeans Clustering: Age vs Networth')\n",
"plt.legend()\n",
"\n",
"plt.subplot(2, 2, 2)\n",
"sns.scatterplot(x=df_cleaned['Rank'], y=df_cleaned['Networth'], hue=df_cleaned['Cluster'], palette='Set1', alpha=0.6)\n",
"plt.scatter(centers[:, 0], centers[:, 1], s=300, c='red', label='Centroids')\n",
"plt.title('KMeans Clustering: Rank vs Networth')\n",
"plt.legend()\n",
"\n",
"plt.subplot(2, 2, 3)\n",
"sns.scatterplot(x=df_cleaned['Age'], y=df_cleaned['Rank'], hue=df_cleaned['Cluster'], palette='Set1', alpha=0.6)\n",
"plt.scatter(centers[:, 2], centers[:, 0], s=300, c='red', label='Centroids')\n",
"plt.title('KMeans Clustering: Age vs Rank')\n",
"plt.legend()\n",
"\n",
"plt.subplot(2, 2, 4)\n",
"sns.scatterplot(x=df_cleaned['Networth'], y=df_cleaned['Rank'], hue=df_cleaned['Cluster'], palette='Set1', alpha=0.6)\n",
"plt.scatter(centers[:, 1], centers[:, 0], s=300, c='red', label='Centroids')\n",
"plt.title('KMeans Clustering: Networth vs Rank')\n",
"plt.legend()\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PCA для визуализации сокращенной размерности"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"PCA (Principal Component Analysis) — метод сокращения размерности, используемый для преобразования высокоразмерных данных в пространство с меньшим количеством измерений, сохраняя при этом как можно больше информации (дисперсии) из исходных данных.\n",
"\n",
"В контексте графиков для визуализации результатов кластеризации, PCA используется для проекции многомерных данных в двумерное пространство, чтобы можно было легко визуализировать кластеры."
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAAJHCAYAAAA+Dx+UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3yb1dn/8Y/2sGTLe2XHibMXM+wECJRZKH0KpayHEQgtT6G0QJkdlA5Gy4aW0Rb6A1ooe5RZKKSBQBJGduIkzvDelqz9+8NYiWI5cRJL8vi+X6+8wPeR7nMk3U7uS9c55zJEo9EoIiIiIiIiIiIiIiIi/Ygx3QMQERERERERERERERHZmRIYIiIiIiIiIiIiIiLS7yiBISIiIiIiIiIiIiIi/Y4SGCIiIiIiIiIiIiIi0u8ogSEiIiIiIiIiIiIiIv2OEhgiIiIiIiIiIiIiItLvKIEhIiIiIiIiIiIiIiL9jhIYIiIiIiIiIiIiIiLS7yiBISIyCESj0XQPQXowlD+bofzaRURERCSe7g2HBn3OItLXlMAQGcTOOeccysvL4/5MmTKFo446ip/97Gc0Nzd3e05FRQW33HILxxxzDNOmTeOoo47iqquuYuXKlT32c9ddd1FeXs4vfvGLZL6cHt1zzz2Ul5enpe9EnnvuOcrLy9m8eXPSnxcIBPjVr37FSy+9tKfD3CNnnnkm5eXlvPHGG0ntp799lvuipaWFn/zkJyxevDh27JxzzuGcc85J2Rh6+/s8d+5crr322j7te82aNZx11ll9cq7NmzdTXl7Oc8891yfnExERkf5DMUt6DKaYpby8nHvuuafb8dWrVzN79myOPPJINmzYEHtseXk5d955Z8JzRSIRDj/88AF771ldXc1vf/tbjj/+eKZPn85hhx3GpZdeGheTQHLikqqqKi655BK2bNnSJ+fr6XMVkaFHCQyRQW7SpEk8/fTTsT+PPfYY559/Ps8++yzz58+Pmx3xr3/9i9NOO42vvvqKyy67jD/+8Y9ceeWVbNiwgf/5n//hww8/7Hb+SCTC888/z/jx43nhhRfw+XypfHlDXk1NDX/+858JhUJJ62P9+vUsWbKE8ePH89RTTyWtn8FmxYoVvPDCC0Qikdixm2++mZtvvjkl/e/N73Nfev3111myZEmfnKugoICnn36ao446qk/OJyIiIv2LYpbBLRUxy87WrFnD+eefj8Ph4IknnmDUqFGxNqPRyOuvv57weZ988gk1NTUpGmXf+vTTTzn11FN59913Offcc3nwwQe5/vrr6ejo4JxzzuH5559Pav8fffQR//73v/vsfE8//TTf/va3++x8IjJwmdM9ABFJLpfLxYwZM+KOHXDAAbS3t3P33XezbNkyZsyYwaZNm7jmmms4/PDD+f3vf4/JZIo9ft68eZx11llcc801vPPOO1it1ljbf/7zH6qqqrjzzjv53ve+x8svv6ybjEHmueeeo7S0lPnz53P11VezceNGRo4cme5hDUhlZWUp6Wdvf5/7K6vV2u3vMRERERk8FLNIX1q3bh3nnXceGRkZ/PnPf6akpCSufdasWSxevJjly5czadKkuLZXXnmFiRMnsmLFilQOeZ81NTXxwx/+kFGjRvHYY4/hcDhibccddxyXXHIJN910E4cddhh5eXlpHGnv6f5fRLpoBYbIEDVlyhQAtm7dCsBf//pXAoEAN9xwQ1wgAOBwOLjmmmv41re+1W0J97PPPsv48ePZb7/9OOigg3j66ad32/fcuXP51a9+xXnnnce0adO4/vrrgc6brptuuolDDjmEqVOn8j//8z8sXLgw7rl+v5/bbruNQw89lJkzZ3Ldddfh9/vjHpNoOeyiRYsoLy9n0aJFsWPr16/n+9//PgceeCAHHHAA8+fPZ926dXF9/fa3v+XII49kypQpnHzyybz66qtx541EItx///0cddRRTJ8+nQULFiRc5r6z3j7vrbfe4rvf/S
4zZ85kypQpHH/88Tz55JNA57Y6Rx99NADXXXcdc+fOjT3v73//O6effjozZsxg2rRpnHrqqbz22mtx5y4vL9/ttkHhcJjnn3+eOXPmcMwxx+B0OhN+xsFgkNtvv50jjjiCadOmceGFF/L88893W17+z3/+kxNOOIGpU6dyyimnsHDhQiZNmrTL5dmvvvoqp59+OjNnzuTQQw/lpptuinuv7rnnHo4//njefPNNTjrpJKZOncqpp57KkiVLWLp0Kd/+9reZNm0aJ510UrfrafXq1cyfP59Zs2Yxa9YsLr/8ciorK2PtXdfNU089xZw5c5g1a1ZsVt+u3uNFixZx7rnnAnDuuefGrscdr83//d//5fTTT+/2ehcsWMApp5wS+3nx4sV873vfY/r06Rx44IFcc801NDQ09Ph+wd7/Pu/4mnf8Xdl57ABffvkl5513Hvvttx8zZ87k/PPPZ+nSpUDnZ3LvvfcC8Uu/I5EIDz/8MMceeyxTpkzhuOOO469//Wu3fq6++mquuOIKZsyYwQUXXNBtC6nnnnuOSZMmsWzZMr7zne8wdepU5syZwyOPPBJ3rpqaGq688srY7/hNN93EXXfdFfe7IiIiIv2XYhbFLL2JWXa0bt06zj33XNxuN0888US35AV0Jsfy8vK6rcIIhUL861//4sQTT+z2nN587g0NDfzsZz9jzpw5TJkyhQMPPJDLL788Lh4655xzuP7663n44Yc56qijmDp1KmeeeSaff/557DEdHR3ccsstHHHEEbH3c+f73J09//zz1NTU8NOf/jQueQGdK06uvvpqzj77bNra2ro9t6ftWq+99tq4z2vTpk1ceumlHHTQQUyfPp3vfOc7sRUXzz33HNdddx0ARx99dNxn9ve//50TTzwxtjXcPffcQzgcjuvnvPPO4+abb2bWrFmccMIJhMPhuDii63dj4cKF/O///i/Tp0/n0EMP5Xe/+13cudra2rjpppuYPXs2M2fO5Morr+Txxx/vV9u3icieUwJDZIiqqKgAYPjw4QB88MEHTJo0icLCwoSPnz17NldeeSX5+fmxY01NTbzzzjt885vfBOC0007jiy++4Kuvvtpt/08++SRTp07l/vvv54wzzsDv93Peeefx9ttvc+WVV3LvvfdSVFTERRddFHdj+OMf/5hnnnmG+fPn8/vf/57m5mYef/zxPX791dXVfOc732HDhg3ccsst/O53v6Ouro7zzjuPpqYmotEol19+OU899RQXXHABDzzwQOwGaMelt7/73e+47777OOOMM7j33nvxeDzccccdu+2/N8977733uPzyy5k8eTL3338/99xzD8OHD+fnP/85y5Yto6CgIPYl8WWXXRb7/yeffJKbbrqJY445hoceeojbb78dq9XK1VdfTVVVVez8Tz/9NAsWLNjlON9//31qa2v55je/id1u5xvf+Ab//Oc/CQQCcY+76aab+POf/8z3vvc97rvvPvLy8rjxxhvjHvP8889z7bXXMmvWLO6//36OO+44FixYEHfDubP777+fq666ihkzZnD33Xdz+eWX88Ybb3DOOefQ0dERe1xVVRW//vWvufTSS/nDH/5AS0sLV1xxBVdddRXf/va3ue+++4hGo1x55ZWx51VUVHDmmWdSX1/Pb37zG2699VYqKys566yzqK+vjxvHvffeyzXXXMNNN93EzJkzd/seT548mZtuuin23iTaNuqUU07hq6++YuPGjbFjLS0tvP/++5x66qlA5xL2888/H7vdzu9//3t++tOf8vHHH3PuuefGvf6d7c3v855oa2vjoosuIjs7m3vuuYe77roLn8/HhRdeSGtrK9/+9rc544wzgPil37fccgt33303p5xyCg8++CDHH388v/rVr7jvvvvizv/aa6+RkZHBAw88wEUXXZRwDJFIhB/+8IeccMIJPPzww8yaNYvf/va3fPDBB0DnXsvnnXcen332GT/96U+57bbbWLlyJY8++uhevWYRER
FJPcUsill6E7N0Wb9+Peeddx4ul4snnniix+vEZDJx3HHHdUtgLFy4EL/f322yS28+92g0yvz58/nwww+5+uqreeS
"text/plain": [
"<Figure size 1600x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pca = PCA(n_components=2)\n",
"reduced_data = pca.fit_transform(data_scaled)\n",
"\n",
"# Визуализация сокращенных данных\n",
"plt.figure(figsize=(16, 6))\n",
"plt.subplot(1, 2, 1)\n",
"sns.scatterplot(x=reduced_data[:, 0], y=reduced_data[:, 1], hue=result, palette='Set1', alpha=0.6)\n",
"plt.title('PCA reduced data: Agglomerative Clustering')\n",
"\n",
"plt.subplot(1, 2, 2)\n",
"sns.scatterplot(x=reduced_data[:, 0], y=reduced_data[:, 1], hue=labels, palette='Set1', alpha=0.6)\n",
"plt.title('PCA reduced data: KMeans Clustering')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Анализ инерции для метода локтя (метод оценки суммы квадратов расстояний)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Анализ инерции для метода локтя — это техника, используемая для определения оптимального числа кластеров в задаче кластеризации (например, для алгоритма K-Means). Метод основывается на оценке суммы квадратичных отклонений (или инерции) объектов от центров их кластеров.\n",
"\n",
"Инерция (в контексте кластеризации) — это метрика, которая измеряет \"плотность\" кластеров, то есть, насколько близко точки внутри каждого кластера расположены к его центроиду.\n",
"Формально инерция определяется как сумма квадратов расстояний всех точек до ближайшего центра кластера.\n",
"\n",
"Метод локтя:\n",
"1. Для различных значений 𝑘 (количества кластеров) вычисляется инерция.\n",
"2. Значения инерции отображаются на графике в зависимости от 𝑘.\n",
"3. Смотрится точка, после которой уменьшение инерции значительно замедляется. Эта точка называется локтем, и соответствующее значение 𝑘 считается оптимальным числом кластеров."
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA28AAAImCAYAAADE77LsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABuyklEQVR4nO3de3zP9f//8ft7s9mGjQ2b8pFT5jTnkSKSjwr1DZ0+oj6iJPIpfEh8KkX6MGepHHIoPuRDKIVPJ0U5TMlhQ4sJmdmG0c7b+/eH3/ud9w7v93vbe3sfdrteLruw1+v1fL2fe/ds9tjr+bw/DUaj0SgAAAAAgEvzcnYHAAAAAAC2UbwBAAAAgBugeAMAAAAAN0DxBgAAAABugOINAAAAANwAxRsAAAAAuAGKNwAAAABwAxRvAAAAAOAGKN4AAAAAwA1QvAEAAACAG6B4AwAXMXjwYIWHh+uxxx4r8poXX3xR4eHheumll8qxZwBK6uzZswoPD9fGjRud3RUAHoDiDQBciJeXlw4ePKiEhIQC59LS0vT11187oVcAAMAVULwBgAtp3ry5KleurG3bthU49/XXX8vf31+hoaFO6BkAAHA2ijcAcCEBAQHq1q1bocXbZ599pnvuuUeVKlUqcO6LL75Q//79FRERoTvuuENTp05VWlqaJKlHjx4KDw8v9OPs2bOSpN27d2vgwIFq3769OnXqpLFjx+r8+fMWrzF27NhC72FrOphpOmhhHzc6fPiwhg4dqk6dOqldu3Z69tln9csvv5jP7927V+Hh4dq7d68k6cSJE+rZs6cee+wxLViwoMjXWLBggSRp/fr1uu+++9SyZUuL87amoH700UeF3vfGdqapcbauK2kf7H1vrL1+UedN/x1eeukl9ejRw+J1165da/Ee3vg6Bw4csLj2ww8/VHh4uMU9MjIyNGvWLPXq1UstW7ZUu3btNGTIEMXGxlq0LapfgwcPtrjG1I/C5B8fJoMHD7a4T2Zmpt5++23de++9ioiIUK9evbR48WLl5eVZtMnfl71799rV1haj0aiJEyeqVatW2rVrl93tAECSCv4EAABwqt69e+uFF15QQkKCwsLCJEnXrl3Tt99+q+XLl+vbb7+1uP6TTz7RuHHjdP/99+uFF17QuXPnNGfOHMXFxWn58uVauHChsrKydPHiRY0aNUojRoxQ9+7dJUm1a9fWpk2bNGHCBPXt21fDhw/XpUuXNH/+fD366KP6+OOPFRISIun6D72PPvqo+vfvL0nm+9mjefPmevXVV82fr1+/Xv/973/Nn+/Zs0fDhg1Tp06d9OabbyozM1PvvfeeHnvsMX300Udq1KhRgXvOnDlTLVu21IgRIxQUFKSuXbtKkqZMmSJJ5tcLCwvT/v37NXnyZD300EOaPHmyqlSpIkl29T8jI0MRERGaPHmy+VhR7W58b/NfV9I+FOe9eeWVV9SiRYtCX3/dunWSpKNHj+r1118vcG1+V65c0dy5cws9V6VKFX311Vdq3769+dhnn30mLy/L3wmPHz9e0dHRGjNmjOrVq6fTp09r3rx5Gjt2rLZu3SqDwWC+9qGHHtLDDz9s/tz039GRjEajnn32WR08eFCjRo1S06ZNtXfvXs2dO1dnzpzRG2+8Yb42/5ht1KiR3W2tmTp1qj799FO9/fbb6tKli8O/RgCejeINAFxM9+7d5e/vr23btunvf/+7JOl///ufQkJCLH5Ylq7/MBoVFaWuXbsqKirKfLx+/fr6+9//rp07d5qLCdNTtnr16qlNmzaSpLy8PEVFRalLly6aNWuWuX27du3Uu3dvLVu2TOPHj5ckpaenq379+ua2pvvZo2rVquZ2kvTdd99ZnJ81a5ZuueUWLV68WN7e3pKkLl266K9//avmz5+vefPmWVx/+vRp7dq1S1u2bNGtt94qSeZCt2rVqpJk8Xpbt26VJL388svmokmSfH19bfY9PT1dNWvWtLhfUe1ufG
/zX3fo0KES9aE4703jxo2LfH3T8czMzEKvzW/+/Pm66aabdOnSpQLn7rzzTn355Zf65z//KUlKSEjQTz/9pA4dOujcuXOSpKysLP3xxx+aPHmyevfuLUnq2LGjrl27prfeektJSUmqVauW+Z5hYWEW/TH9d3Skb7/9Vt9//71mz56tPn36SJLuuOMO+fn5ad68eXriiSfM4yn/mN25c6fdbYsya9YsrVu3TgsXLtSdd97p8K8PgOdj2iQAuBg/Pz/16NHDYurk1q1bdd9991k8qZCkkydPKiEhQT169FBOTo75IzIyUlWrVtXu3butvtapU6d08eJF9e3b1+J4vXr11LZtW+3bt8987Pz586pWrZoDvkJLaWlpOnz4sO677z5zcSJJgYGBuuuuuyz6YLp+zpw56tSpk80flk1atWolSXr//feVmJiorKws5eTk2NXWUV93SfpQ3PfGUU6cOKF169bpX//6V6Hne/Toofj4eJ08eVKStG3bNrVu3Vo333yz+RpfX18tW7ZMvXv31oULF7Rnzx6tXbvWHLqTlZVV7H7l5eUpJydHRqPR5jWmjxuv3bdvnypVqqR7773Xos0DDzxgPl+U0rSVpNWrV2vx4sXq06ePxdNZACgOnrwBgAu67777NGrUKCUkJKhy5cr64Ycf9MILLxS47vLly5KuTzErbJpZYmKi1dcxta9Zs2aBczVr1lRMTIyk60/4fv/9d9WtW7d4X4gdrl69KqPRWGQfrl69anHs2WefVWBgoMW0S1siIyM1efJkLV68WAsXLixW/86dO2d1emFZ9qG4742jTJ06VX369FHbtm0LPR8aGqqWLVvqyy+/VMOGDfXZZ5+pb9++5vFi8t133+nNN9/UyZMnVaVKFTVt2lQBAQGSZLUAK8qiRYu0aNEieXt7q2bNmurSpYv+8Y9/WIT4mJ5W36hjx46Srk8FrVGjhkUhLMn8BNDa+1matpJ07NgxdenSRZ9++qmefPJJNW/e3Or1AFAYijcAcEF33nmnqlSpom3btikgIEB169ZVy5YtC1wXGBgo6fraItMPqDcKCgqy+jrVq1eXJCUlJRU4d/HiRdWoUUOSFBsbq4yMjAIhI45QrVo1GQyGIvtg6qPJ+PHjtW3bNo0ePVqrV6+2e3rdI488ol27diknJ0evvPKK6tatqxEjRlhtk5eXp59//lkDBgyw6zXyPxktbR+K+944wueff64jR45YTKMtzN13360vv/xS9913n44cOaKFCxdaFG+//fabRo4cqZ49e+q9997TX/7yFxkMBq1evbrAtFnJ9nsnXX//HnnkEeXl5en333/XnDlz9PTTT2vLli3ma6ZMmWJRbN+4bi0oKEiXLl1Sbm6uRRFm+iWHabwXpjRtJekf//iHnnjiCfXp00eTJ0/W+vXrCxSCAGAL0yYBwAX5+vqqZ8+e2r59uz7//HPzGpv8GjZsqJCQEJ09e1YRERHmj9DQUM2aNavAk5D8GjRooFq1aunTTz+1OH7mzBkdPHhQ7dq1kyR98803atasmYKDg4v9teTl5Vn9ITUgIEAtW7bU559/rtzcXPPxq1ev6ptvvimwzq9ly5ZauHChzp07p5kzZ9rdj3nz5umbb77RW2+9pfvuu08RERE215v9+OOPSktLU6dOnaxeZ3qKlD+wo7R9KO57U1pZWVmaMWOGRo4cabEerTA9e/bUzz//rA8//FDt27dX7dq1Lc4fOXJEmZmZeuaZZ1SvXj1zcWYq3EzvmSmp0dZ7J10P2ImIiFDr1q1133336fHHH9fx48d15coV8zUNGjSw+H/hxvWFHTt2VE5OToE0V1PxZ+39LE1b6fqTUj8/P73yyis6evSoli9fbvPrBYD8ePIGAC6qd+/eGj58uLy8vCySDm/k7e2tF198Ua+88oq8vb111113KTU1VYsWLdKFCxdsTvfz8vLSmDFjNHHiRI0dO1YPPPCALl26pIULFyooKEhDhgzR0aNHtXr1av
Xp00cHDx40t7148aKk609YUlJSChR2KSkpiouL0+nTp81FYFHGjh2roUOH6plnntHAgQOVnZ2txYsXKysrSyNHjix
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"inertias = []\n",
"clusters_range = range(1, 90)\n",
"for i in clusters_range:\n",
" kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
" kmeans.fit(data_scaled)\n",
" inertias.append(kmeans.inertia_)\n",
"\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(clusters_range, inertias, marker='o')\n",
"plt.title('Метод локтя для оптимального k')\n",
"plt.xlabel('Количество кластеров')\n",
"plt.ylabel('Инерция')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оптимальное кол-во кластеров - примерно 75-80."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Расчитаем коэффициенты силуэта"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAImCAYAAABZ4rtkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACHXUlEQVR4nOzdd3iUVfrG8XvSE9JIAoSiQEBKSEINHSki7ioiYlfKqhQVZS2A8pPFhqJIkSIqAi6giAUELKss6lpAqWqAUEQE6SEdSM/M74+YkSHJZCaZZGaS7+e6vCTzvvPOM+E1zp1zznMMJpPJJAAAAACAzTycXQAAAAAAuBuCFAAAAADYiSAFAAAAAHYiSAEAAACAnQhSAAAAAGAnghQAAAAA2IkgBQAAAAB2IkgBAAAAgJ0IUgAAAABgJ4IUgBphxIgRGjFihMVjO3bs0JAhQxQdHa0PPvigSl//iSee0IABA+x+3oABA/TEE09UQUUAqkrr1q21YMECZ5cBwMm8nF0AAFSFlJQU3XfffWrXrp2WLl2q1q1bO7skAABQgxCkANRIb731lnJycjRz5kw1aNDA2eUAAIAahql9AGqctLQ0rVq1Stdff32JEHXkyBFNmDBBvXr1UocOHTRixAjt3LnT4pz//e9/GjZsmNq3b6+ePXvqqaee0rlz5yzOeeedd9S/f3+1b99ejzzyiM6fPy9Jeu2119SjRw916dJFTz31lPLy8szPycvL0zPPPKP4+Hh169bNPDXowoULmjRpkjp06KC+ffvqnXfeMT/n+PHjat26tdauXWt+LDc3V1dddZXFKFtpUxu3bt2q1q1ba+vWraV+LRWN3HXp0qXEtMQPPvhA1113nWJiYtSvXz8tWLBAhYWF5uOlTWW8uNbi1yrtn+I6y5vWWNp7ulRSUpIef/xx9ejRQx07dtTw4cP1008/mY9fOgXLZDLp9ttvV+vWrXX8+HGL86zVOmHCBF155ZUyGo0Wr//kk0/qmmuukSSdPn1ajz76qLp376727dtrxIgR+vnnnyVJCxYsKPM1iuvbv3+/HnzwQXXv3l3t2rVTnz59NH36dOXk5Fj9HmzevNlq7ba+R0n68ssvdeONN6p9+/ZWr3WxtWvXqnXr1vrll1904403Ki4uTtdff70+//xzi/OOHz+uyZMnq3fv3mrXrp169OihyZMnKy0tzXzOvn37dNddd6ljx44aOHCgVq9ebT5W2v0rlbxPypt2d/F9t2LFihL/ff34449q06aNXn311TKvcan58+erbdu2+uijj2x+DgD3x4gUgBrDZDLp1KlTmj59ugoKCjRu3DiL44cOHdKtt96qZs2aaerUqfL29taKFSs0atQoLVu2TF27dtX27dt1//33a8iQIXrsscf066+/6pVXXtHBgwf19ttvy9PTU5s2bdKzzz6rESNG6Morr9R7772nTZs2SZI+++wzTZ8+XSdOnNCsWbPk5+enKVOmSJJefvllrVmzRpMnT1ZkZKTmzp2rEydO6MSJE/rb3/6m+fPn69tvv9Wzzz6ryMhIXXXVVaW+zyVLlliEgMqYPXu2zp07p+DgYPNjb7zxhubOnavhw4drypQp2rdvnxYsWKBTp07phRdesOm67dq103vvvSepKJR9+OGH5q8DAwMdUvuFCxd0xx13qLCwUJMmTVKDBg20bNky3XPPPfroo4/UrFmzEs9Zv369RdC62M0336xbbrnF/PUzzzxjceyLL77Q1q1b1aNHD0lSTk6OPv/8c40ZM0Z5eXkaPXq08vPz9dRTT8nb21uLFi3SiBEj9P777+uWW25Rnz59LK771FNPSZIiIyOVlJSku+66Sx06dNCLL74oHx8fffvtt3rrrbdUv359jR07tszvQ05OjiIjIzVv3rxSa7f1Pf7xxx/65z//qT59+uiRRx4x3xNlXetS48aN0/Dhw/XII4/oww8/1MMPP6w33nhDffv2VXZ2tkaOHKm6de
vqqaeeUlBQkH766SctXLhQfn5+evbZZ5Wdna0xY8aocePGWrBggXbt2qWnnnpKjRo10pVXXmlTDfYaMWKENm7cqJdeekn9+vWTj4+P/u///k8dOnTQfffdZ9M1li5dqkWLFmn69Om68cYbq6ROAK6JIAWgxti+fbv69esnb29vvfnmmyU+SC9cuFA+Pj5asWKF+cN8v379NHjwYM2cOVMffvih1q1bp2bNmmnGjBny8PBQr1695O/vr2nTpumbb77RgAED9Prrr6tbt26aOnWqJKlbt27q1auXzp07pxkzZigmJkaSlJmZqTfffFMPPPCAjEaj3nvvPY0dO1bDhw+XJEVEROi2225TaGioZs2aJW9vb1155ZU6ePCg3njjjVKD1KlTp/Tmm2+qXbt22rt3b6W+X7t379b69evVtm1bZWZmSpLOnTunRYsW6bbbbjO/v969eys0NFRTp07V3XffrSuuuKLcawcGBqpDhw6SpO+++06SzF87ykcffaQTJ07oo48+Utu2bSVJnTp10tChQ7V9+/YSf/8XLlzQrFmzyvzeRUZGWtR4ceDr3bu3IiMjtW7dOnOQ+u9//6usrCwNHTpUP//8sw4fPqx33nlHHTt2NNdy9dVXa9GiRVqwYIEiIyMtrnvxa33//fdq27at5s2bZz7es2dPbd68WVu3brUapLKzsxUcHFxm7ba+x8TEROXn5+uRRx5Rq1atyr3WpUaMGKHx48dLkvr06aMbb7xRr776qvr27asjR44oMjJSL730ki677DJJUvfu3fXLL79o27ZtkqQTJ04oNjZW//d//6fLLrtMvXv31qpVq/Tdd99VWZAyGAyaMWOGhgwZopdfflmenp5KT0/X8uXL5enpWe7z3333Xb388st69tlndfPNN1dJjQBcF1P7ANQY0dHRevHFFxUSEqIpU6aUGLXZtm2b+vfvb/HB0MvLS9ddd5327NmjCxcu6Pnnn9e6devk4eGhgoICFRQU6JprrpGHh4e2b9+ugoICJSYmqnfv3uZr+Pr6qn379vL39zeHKKnow3dOTo4OHDigAwcOKDc31zwqIRV9kPb19VVcXJy8vb0tnrd3716LqXTFXnrpJXXp0kX9+/ev1PfKZDJp+vTpuvnmm9WmTRvz4z/99JNycnI0YMAA8/svKCgwT+PbvHmzxXUuPufSaW+21lHR5+7cuVNNmjQxhyhJ8vf31xdffGEx6lJs0aJFqlu3ru644w67X8vDw0M33nijNm7cqOzsbElFQa5nz56KjIxU165d9fPPP6tDhw4qLCxUQUGBgoOD1atXL23fvr3c6/fu3Vtvv/22fH19dejQIX355Zd67bXXlJqaajE9tDSnTp1SUFCQ3e/pUu3atZOXl5fefvttnThxQnl5eSooKJDJZLLp+RePxhgMBl199dVKSEhQTk6O2rZtq1WrVqlx48Y6cuSIvvnmGy1dulSHDx82v7+WLVvqtdde02WXXaa8vDx9++23ysjIUIsWLSxex2g0Wtx3pdVXfI4ttV922WWaOHGiPvroI33wwQeaOnWqOexZ8/XXX+uZZ55Rly5ddOutt5Z7PoCahxEpADVGYGCgbrzxRkVFRemOO+7Qww8/rPfee8/8m+WMjAxFRESUeF5ERIRMJpPOnz+vOnXqyNfXV1LRB8uLZWZmKiUlRYWFhapbt67FsdDQUIWEhFg8Vjw1Kjk52RyKLn1eSEiIQkNDSzyvoKDAYu2IVBQEN23apA0bNujTTz+15VtSpnXr1unIkSN6/fXX9dJLL5kfT09Pl6QyR0CSkpLMfz5x4kSJ71FF6li3bp0MBoPCw8PVuXNn/fOf/yzx4bk06enpCg8Pt+l1jhw5ouXLl2vJkiU6efJkhWq96aab9Prrr2vjxo3q3r27fvjhB82aNct83MfHR1LRuqmL18rYMrJhNBo1Z84cvfPOO8rKylLDhg0VFxdnvhetOXHihBo3blyBd2Tpsssu08svv6w5c+aYp2
EW69q1a7nPr1+/vsXX4eHhMplMyszMlJ+fn9566y29/vrrSk9PV0REhGJiYuTv719i/WFmZqbi4+MlSfXq1dPf//5
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"silhouette_scores = []\n",
"for i in clusters_range[1:]: \n",
" kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
" labels = kmeans.fit_predict(data_scaled)\n",
" score = silhouette_score(data_scaled, labels)\n",
" silhouette_scores.append(score)\n",
"\n",
"# Построение диаграммы значений силуэта\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(clusters_range[1:], silhouette_scores, marker='o')\n",
"plt.title('Коэффициенты силуэта для разных k')\n",
"plt.xlabel('Количество кластеров')\n",
"plt.ylabel('Коэффициент силуэта')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Средний коэффициент силуэта (silhouette score) используется для оценки качества кластеризации. Его значение лежит в диапазоне от -1 до 1. Что означают различные значения:\n",
"\n",
"* Близко к 1.0 (0.71.0): Кластеры хорошо разделены и компактны. Это отличный результат кластеризации.\n",
"\n",
"* От 0.5 до 0.7: Кластеры четко различимы, но есть некоторое пересечение между ними. Это хороший результат.\n",
"* От 0.25 до 0.5: Кластеры перекрываются, что указывает на менее четкую границу между группами. Качество кластеризации удовлетворительное, но может потребоваться уточнение числа кластеров или доработка данных.\n",
"\n",
"* Близко к 0.0: Кластеры сильно перекрываются или распределение данных не позволяет выделить четкие группы. В этом случае нужно пересмотреть выбор числа кластеров, алгоритм или исходные данные.\n",
"\n",
"* Меньше 0.0: Плохая кластеризация: точки ближе к центрам чужих кластеров, чем к своим. Это сигнал о том, что данные плохо структурированы для текущей кластеризации."
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средний коэффициент силуэта: 0.384\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAJzCAYAAAA4M0NGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gc1dXA4d/2qt1Vr5YlS7bcewUbN3qNIYFAwEAg1EDoIYF8ISQQQkwJmF5CwAEcmkMvphhsjHvvRbJk9S5tb/P9oWjttSRLwmq2z/s8TtDc2Zmzo5E0Z++956oURVEQQgghhBBCCNEmdW8HIIQQQgghhBB9nSROQgghhBBCCNEOSZyEEEIIIYQQoh2SOAkhhBBCCCFEOyRxEkIIIYQQQoh2SOIkhBBCCCGEEO2QxEkIIYQQQggh2iGJkxBCCCGEEEK0QxInIYQQQgghhGiHJE5CiB/tsssuIy8vL+rf+PHjmTt3LitXruzt8IQQx7i8vDyefPLJFtt37tzJlClTmD59OgUFBW2+/sknnyQvL48RI0bgdDpb3eeNN94gLy+PWbNmdVXYQoijlCROQogjMnToUBYuXMjChQt5/fXXeeihh9DpdFx11VXs2rWrt8MTQhxndu3axRVXXIHJZGLBggVkZWW1+5pgMMhXX33VatvHH3/cxREKIY5WkjgJIY6I1Wpl9OjRjB49mnHjxnHyySfz5JNPolareffdd3s7PCHEcWTPnj1cfvnlWCwWFixYQL9+/Tr0urFjx/LJJ5+02F5eXs7q1asZMmRIV4cqhDgKSeIkhOhyJpMJg8GASqWKbLvsssu47LLLovZ75JFHyMvLi0qwFixYwOzZsxkzZgyXXnopO3fuBODf//43eXl55OfnRx3jv//9L0OGDKG0tBSAxYsXc8kllzBmzBiGDx/O6aefzr///e+o19x9990thhg2/9u/f39kn0OH5rz55psthgZ9/PHHnHnmmYwePZrzzz+f1atXR72mvXhWrFhBXl4eK1asiHrdoderI9fP7/fzt7/9jenTpzNkyJCo93W4JPbQYz/wwAOMGDGCb7/9FjgwnKm1fwfH3ZFrX1FRwW9/+1umTJkS+R6vW7cOgFmzZrX7fVm9ejWXXnopo0aNYuLEifz2t7+lpqYmcvx3332XvLw8NmzYwJw5cxg5ciTnnHMOn376aVQcjY2N/PWvf+Xkk09mxIgRnH322bz99ttR+xwcz+DBg5kwYQI33XQTtbW1bV5LgL179/LrX/+aiRMnMmHCBK699lr27NnT5v6Hu74Hf98KCgq4+eabOfHEExk9ejSXXXYZa9asibTv378/8rr3338/6hxff/11pO1gH3/8Meeffz5jxozhxBNP5P/+7/+or69vEdvBWrsXZ82axd13393m14dqjvXg97d27VouuugiRowYwYknnsif//xnvF5vm8c41J49e5g7dy4xMTEsWLCAtLS0Dr/2zDPPZOnSpS2G63366adkZ2czePDgFq9ZvHgx559/fiTev/zlL7jd7hb7dOTnf/ny5fzyl79k1KhRnHjiifz9738nFApF9lu2bBkXXnghY8aMYcKECVx//fWHvaeEEN1DEichxBFRFIVgMEgwGCQQCFBZWckjjzyC3+/nggsuaPN1hYWFvPLKK1HbPv/8c/785z9z1lln8dRTTxEKhbjuuuvw+/2cc845GAwG/vvf/0a9ZtGiRUyZMoXU1FS++eYbbrzxRoYNG8bTTz/Nk08+Sb9+/bj//vvZsGFD1OsSExMjQwwXLlzI9ddff9j3WV9fz+OPPx61bePGjdxxxx2MHj2aZ555htTUVK677jqqqqoAOhVPZ7V2/V544QX+9a9/cfnll/Ovf/2LhQsXMn/+/E4dd+PGjbzxxhs8/vjjjBkzJqrt4Ov1f//3f1FtHXmvLpeLiy++mBUrVnDnnXcyf/58DAYDv/zlLykoKGD+/PlRMV9//fWR8yUlJb
Fq1SquuOIKjEYjjz/+OL///e9ZuXIlc+fObfGAfe211zJ79mzmz59PdnY2t9xyC0uWLAHA6/VyySWX8MEHH3D11Vfz9NNPM27cOO655x6effbZqONMnz6dhQsX8tprr3H77bezbNkyHnjggTavX3l5ORdddBEFBQXcd999/P3vf6eqqorLL7+curq6w177g6/vod+33bt3c/7557N//37uvfde5s2bh0ql4vLLL28xn9BisbQYdvbxxx+jVkf/yX/66ae57bbbGD16NE888QQ33ngjn332GZdddlmnEpauUFpaylVXXUVsbCzz58/n5ptv5r///S933XVXh16/d+9eLr/8cqxWKwsWLCA5OblT5z/ttNMIhUKtXrezzjqrxf4ffPABN954IwMGDOCpp57i17/+Ne+//z433HADiqIAnfv5v+OOOxg3bhzPPvssZ599Ni+++CJvvfUWAEVFRdxwww0MHz6cZ555hgceeID8/HyuueYawuFwp96nEOLIaHs7ACHE0W3VqlUMGzasxfbbbruNnJycNl/34IMPMnDgQLZs2RLZVlNTwyWXXMJtt90GNPWgNH9aP2TIEE455RTef/99fvOb36BSqSgrK+OHH37g73//O9D0cDlnzhzuueeeyDHHjBnDpEmTWLFiBaNGjYps1+v1jB49OvL13r17D/s+n3jiCdLS0qJ6G8rKyjjttNP4y1/+glqtJiEhgbPPPpv169dz8skndyqezmrt+m3cuJHBgwfzy1/+MrKtuaemo5p7/GbPnt2i7eDr5fP5oto68l7fe+89iouLee+99yJDn8aOHctPfvITVq1axc9+9rOomDMzM6PO+cgjj5Cdnc1zzz2HRqMBYNSoUZx11lm88847/OIXv4jse9lll3HjjTcCMG3aNObMmcNTTz3F9OnTeffdd9m5cydvvvlmJDmcNm0awWCQp59+mp///Oc4HA4A4uLiIjFMmDCB77//PuqaH+qVV17B7/fzz3/+k8TERAAGDx7MxRdfzIYNG5g+fXqbrz34vR76fZs/fz56vZ5XX30Vq9UKwIwZMzj77LN5+OGHo3rLTjrpJL777jv8fj96vR6fz8eXX37JhAkTIj2E9fX1PPPMM1x44YVRSfCgQYP4xS9+0eJ6drcXXniB2NhYnnrqqcj3Vq1Wc++997Jjx44WvV4HKygoYO7cuVRVVREIBH5UMpGQkMCECRP45JNPOPfccwEoLi5mw4YNPPzwwzzzzDORfRVFYd68eUybNo158+ZFtmdlZXHFFVewZMkSZsyY0amf/5/97GeR+3XKlCksXryYb775hp///Ods3LgRr9fLtddeG0kIU1JS+PLLL3G73ZH7QQjR/SRxEkIckWHDhvGnP/0JaHqgaGho4Ntvv+Wxxx7D7XZz6623tnjNt99+y/fff88LL7zA3LlzI9t//vOfAxAOh3G73Xz++ecYjUbS09MB+OlPf8qHH37I6tWrmTBhAosWLcJisXDKKacAcPXVVwNNPRv5+fkUFhayadMmoCkJ+7F27twZ6XVojhHg1FNP5dRTT0VRFNxuN5988glqtZrs7Oxujaet6zdixAief/55PvvsMyZPnozFYunwQ6SiKKxbt46PP/64RU9WR3Tkva5Zs4aMjIyo+SImk4nPPvus3eN7PB42bNjAVVddFenlBOjXrx85OTksW7Ys6kF/zpw5kf9WqVSccsopPPnkk3i9XlauXEl6enqLHrVzzz2Xt99+OyrBaT5XOBxm+/btrFmzhhNOOKHNONesWcPo0aMjSRM0PeR+/fXX7b7Hw1m5ciUzZ86MekjWarWR3lmXyxXZPnnyZL799ltWrFjBtGnT+Pbbb7FarYwfPz6SOK1fvx6/38/ZZ58ddZ7x48eTnp7OypUrjzhxar52arW6RW9Xs3A4TDAYZPXq1UydOjWSNEFTAghN1/RwidOHH37I8OHDeeyxx/jlL3/JnXfeySuvvBJ1zlAoFOkJgqZ74uBzQd
Nwvb/85S84nU6sVisfffQRw4YNo3///lH77d27l7KyMq699trIfQhNibXVamXZsmXMmDGjUz//h96LKSkpkWF/o0a
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.metrics import silhouette_score\n",
"from sklearn.cluster import KMeans\n",
"\n",
"# Применение K-Means\n",
"kmeans = KMeans(n_clusters=85, random_state=42) \n",
"df_clusters = kmeans.fit_predict(data_scaled)\n",
"\n",
"# Оценка качества кластеризации\n",
"silhouette_avg = silhouette_score(data_scaled, df_clusters)\n",
"print(f'Средний коэффициент силуэта: {silhouette_avg:.3f}')\n",
"\n",
"# Визуализация кластеров\n",
"from sklearn.decomposition import PCA\n",
"\n",
"pca = PCA(n_components=2)\n",
"df_pca = pca.fit_transform(data_scaled)\n",
"\n",
"plt.figure(figsize=(10, 7))\n",
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_clusters, palette='viridis', alpha=0.7)\n",
"plt.title('Визуализация кластеров с помощью K-Means')\n",
"plt.xlabel('Первая компонентa PCA')\n",
"plt.ylabel('Вторая компонентa PCA')\n",
"plt.legend(title='Кластер', loc='upper right')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Результат неплохой, показывает умеренно хорошую кластеризацию."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}