1070 lines
1.9 MiB
Plaintext
Raw Normal View History

2024-11-22 23:48:04 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вариант 19 Данные о миллионерах https://www.kaggle.com/datasets/surajjha101/forbes-billionaires-data-preprocessed"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Rank ', 'Name', 'Networth', 'Age', 'Country', 'Source', 'Industry'], dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Name</th>\n",
" <th>Networth</th>\n",
" <th>Age</th>\n",
" <th>Country</th>\n",
" <th>Source</th>\n",
" <th>Industry</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Elon Musk</td>\n",
" <td>144.0</td>\n",
" <td>50</td>\n",
" <td>United States</td>\n",
" <td>Tesla, SpaceX</td>\n",
" <td>Automotive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Jeff Bezos</td>\n",
" <td>138.0</td>\n",
" <td>58</td>\n",
" <td>United States</td>\n",
" <td>Amazon</td>\n",
" <td>Technology</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Bernard Arnault &amp; family</td>\n",
" <td>133.0</td>\n",
" <td>73</td>\n",
" <td>France</td>\n",
" <td>LVMH</td>\n",
" <td>Fashion &amp; Retail</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Bill Gates</td>\n",
" <td>129.0</td>\n",
" <td>66</td>\n",
" <td>United States</td>\n",
" <td>Microsoft</td>\n",
" <td>Technology</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Warren Buffett</td>\n",
" <td>118.0</td>\n",
" <td>91</td>\n",
" <td>United States</td>\n",
" <td>Berkshire Hathaway</td>\n",
" <td>Finance &amp; Investments</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Rank Name Networth Age Country \\\n",
"0 1 Elon Musk 144.0 50 United States \n",
"1 2 Jeff Bezos 138.0 58 United States \n",
"2 3 Bernard Arnault & family 133.0 73 France \n",
"3 4 Bill Gates 129.0 66 United States \n",
"4 5 Warren Buffett 118.0 91 United States \n",
"\n",
" Source Industry \n",
"0 Tesla, SpaceX Automotive \n",
"1 Amazon Technology \n",
"2 LVMH Fashion & Retail \n",
"3 Microsoft Technology \n",
"4 Berkshire Hathaway Finance & Investments "
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Rank</th>\n",
" <th>Networth</th>\n",
" <th>Age</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>2600.000000</td>\n",
" <td>2600.000000</td>\n",
" <td>2600.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1269.570769</td>\n",
" <td>4.809596</td>\n",
" <td>64.271923</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>728.146364</td>\n",
" <td>9.845084</td>\n",
" <td>13.220607</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>19.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>637.000000</td>\n",
" <td>1.500000</td>\n",
" <td>55.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1292.000000</td>\n",
" <td>2.400000</td>\n",
" <td>64.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1929.000000</td>\n",
" <td>4.500000</td>\n",
" <td>74.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>2578.000000</td>\n",
" <td>144.000000</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Rank Networth Age\n",
"count 2600.000000 2600.000000 2600.000000\n",
"mean 1269.570769 4.809596 64.271923\n",
"std 728.146364 9.845084 13.220607\n",
"min 1.000000 1.000000 19.000000\n",
"25% 637.000000 1.500000 55.000000\n",
"50% 1292.000000 2.400000 64.000000\n",
"75% 1929.000000 4.500000 74.000000\n",
"max 2578.000000 144.000000 100.000000"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rank 0\n",
"Name 0\n",
"Networth 0\n",
"Age 0\n",
"Country 0\n",
"Source 0\n",
"Industry 0\n",
"dtype: int64\n",
"Rank False\n",
"Name False\n",
"Networth False\n",
"Age False\n",
"Country False\n",
"Source False\n",
"Industry False\n",
"dtype: bool\n"
]
}
],
"source": [
"# Процент пропущенных значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
"\n",
"print(df.isnull().sum())\n",
"\n",
"print(df.isnull().any())"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Rank int64\n",
"Name object\n",
"Networth float64\n",
"Age int64\n",
"Country object\n",
"Source object\n",
"Industry object\n",
"dtype: object"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Проверка типов столбцов\n",
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Атрибуты \n",
"\n",
"Rank: Рейтинг миллиардера в списке Forbes.\n",
"\n",
"Name: Имя миллиардера.\n",
"\n",
"Networth: Чистая стоимость в миллиардах долларов США.\n",
"\n",
"Age: Возраст миллиардера.\n",
"\n",
"Country: Страна, в которой проживает миллиардер.\n",
"\n",
"Source: Основной источник богатства \n",
"\n",
"Industry: Индустрия, в которой миллиардер заработал свое состояние.\n",
"\n",
"# Цель:\n",
"Оптимизация стратегий инвестирования и маркетинга для финансовых учреждений и компаний, стремящихся привлечь миллиардеров как клиентов или партнеров.\n",
"Кластеризация миллиардеров на основе их характеристик (возраст, страна проживания, источник богатства, индустрия) для выявления групп с похожими профилями."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Очистка данных"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Цель: Упростить набор данных, удалив несущественные столбцы, чтобы сосредоточиться на ключевых атрибутах, которые будут использоваться для кластеризации и анализа.\n",
"- Rank - этот столбец можно удалить, так как он не влияет на характеристики миллиардера\n",
"- Name - этот столбец можно удалить, так как он не является количественным атрибутом"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Networth Age Country Source Industry\n",
"0 144.0 50 United States Tesla, SpaceX Automotive \n",
"1 138.0 58 United States Amazon Technology \n",
"2 133.0 73 France LVMH Fashion & Retail \n",
"3 129.0 66 United States Microsoft Technology \n",
"4 118.0 91 United States Berkshire Hathaway Finance & Investments \n"
]
}
],
"source": [
"# Удаление несущественных столбцов\n",
"columns_to_drop = ['Rank ', 'Name']\n",
"df_cleaned = df.drop(columns=columns_to_drop)\n",
"\n",
"print(df_cleaned.head()) # Вывод очищенного DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Визуализация парных взаимосвязей\n",
"Визуализировать ключевые атрибуты миллиардеров для выявления закономерностей и связей между ними."
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAA3QCAYAAADIJEmGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXidZZk/8G+6QQtJZC0EpCFFKEhbFillRgSqgyK4IDqikCIjyFLAqmwjKCDKuKCAFHBhkwiyiAOoiAqKqFOgdRT4SRFqCYuhFBRDgEJbmt8fZ5oSkkLSJD1vcj6f6+JK3vd5zzl3Tm6S0/PN8zxV7e3t7QEAAAAAACiQYeUuAAAAAAAA4NUEGAAAAAAAQOEIMAAAAAAAgMIRYAAAAAAAAIUjwAAAAAAAAApHgAEAAAAAABSOAAMAAAAAACgcAQYAAAAAAFA4AgwAAID/097eXpGPDQAARSTAAACACvCZz3wm22yzTS699NJyl9Jnd911V7bZZpscffTR3Y7/6Ec/yjbbbJPHH3+8V/d74YUX5pJLLumPEnvtoYceykc+8pFO57bZZpucf/75ZakHAACKQIABAABDXFtbW2699dZsvfXWueaaa4bMX/rfdtttuemmm/rt/s4777wsXry43+6vN2655Zb88Y9/LMtjAwBAUQkwAABgiPvJT36SJDnllFPS3NycO++8s8wV9Y+ampp86UtfytNPP13uUgAAgAEgwAAAgCHu+uuvz2677ZapU6dm3Lhxufrqq7tcc8kll+Ttb397Jk2alAMPPDC/+tWvss022+Suu+7quObBBx/MEUcckZ122ik77bRTZsyYkccee2yVj/vjH/8422yzTR588MFO52+99dZss802uf/++5Mk3/ve9/Kud70rEydOzO67757TTz89zz333Ot+XZ/61Kfywgsv5PTTT3/da1taWvLpT386U6ZMyeTJk3PIIYd0PH5SWq4pSWbNmpVtttkmV1xxRSZMmJBnnnmm45oLLrgg22yzTWbPnt3pa5kwYUKefPLJJMl9992Xj3/849l1112z00475cgjj8xDDz3Ucf2K5a+uvvrq7LXXXtlpp51y4IEHZtasWR11vHLZqOeeey6nnHJKpkyZkh133DHHHXecwAYAgIohwAAAgCHsoYceyn333Zf3v//9SZL3v//9ue222zq9CT5r1qycffbZ2WeffXLhhRdm8uTJmTlzZqf7efjhh3PggQfm73//e77yla/kS1/6Uh577LF85CMfyd///vduH/sd73hHxowZk5/+9Kedzv/kJz/Jm970pmy33Xb5yU9+kq997Ws56KCDcskll2TGjBm58cYbc+aZZ77u1zZ+/Pgce+yx+eUvf9kxy6Q7//jHP3LggQfmz3/+cz73uc/l61//epYvX56DDjoof/3rX5Mk11xzTZLkgx/8YK655prsueeeaW9v7zRbZcXnc+bM6Th3xx13ZLvttsvYsWNz5513duxjcdZZZ+WLX/xinnjiiRx44IEdj7PCrFmzctJJJ+Xzn/98vv71r+eDH/xgRx0f+tCHOq674oorsnTp0px33nn5zGc+k1/96lf5whe+8LrPDQAADAUjyl0AAAAwcK6//vq84Q1vyLRp05Ik+++/f84///z88Ic/zJFHHpkXXngh3/3ud3PQQQfl+OOPT5K89a1vzeLFizve1E9Kb7iPHj06l19+edZdd90kyW677ZZ3vOMdufjii3PSSSd1eezRo0fnne98Z26++eZ86lOfSpI8//zz+fWvf50ZM2YkSe6+++5svvnmOeiggzJs2LBMmTIlY8aMSWtra4++vo9//OP55S9/mTPPPDNTp07Nhhtu2OWa733ve/nnP/+ZH/zgB9lss82SJG9729vy7ne/O+edd16++c1vZocddkiSbLLJJh2fb7nllpk9e3b22WefLF68OH/84x/z5je/uVOA8dvf/jYf+MAHkiRf//rXM27cuHznO9
/J8OHDO57Lf/u3f8s3v/nNnHfeeR23++hHP5p3vetdHcebbLJJknQ89goTJ07MV7/61Y7n+5577slvfvObHj03AAAw2JmBAQAAQ9TSpUtz00035R3veEdefPHFPPvss1lnnXWy884759prr83y5cvzpz/9KS+++GKnN9OTZL/99ut0fOedd2bKlClZe+21s2zZsixbtizrrrtu3vKWt+R//ud/VlnD+973vjz66KO59957k5Q23l6yZEne+973JkmmTp2ahx9+OB/4wAcya9as3HfffXnPe96TxsbGHn2Nw4cPz3/913/lhRdeyBlnnNHtNbNnz862226bsWPHdtQ+bNiwvO1tb3vN2vfcc8+O8T/84Q8ZOXJkpk+fnnvuuSdLlizJ/Pnz09LSkj333DMvvPBC7rvvvuyzzz4d4UVS2qdjr732yt13393pvrfddtsefX0777xzp+PNN988zz77bI9uCwAAg50ZGAAAMETdfvvt+fvf/54f/vCH+eEPf9hl/Le//W3a2tqSJOuvv36nsQ022KDT8T//+c/cfPPNufnmm7vcz6tv+0q77rprxo4dm5/+9KeZNGlSfvrTn2bKlCkdMw7e/e53Z/ny5bnqqqty4YUX5vzzz89mm22W448/Pu9+97t79HVutdVWOeaYY/KNb3yjy3JVK2p/5JFH8uY3v7nb2y9evDijR4/ucn6PPfbIZZddlscffzyzZ8/OTjvtlN122y0vvfRS7rnnnvy///f/stFGG2X77bfPokWL0t7e3u0MkA033LDjeV5hzJgxPfraXn3dsGHD0t7e3qPbAgDAYCfAAACAIer666/PG9/4xnzpS1/qdL69vT3HHHNMrr766nz84x9Pkvz9739PQ0NDxzX/+Mc/Ot2muro6//Iv/5JDDz20y+OMGLHqf1YMGzYs73nPe/KTn/wkRx55ZH7/+9932cNhv/32y3777Ze2trb87ne/y3e/+92ccMIJ2XnnnTN27Ngefa2HHXZYfvGLX+TMM8/s+JpeWfuUKVNy4okndnvbUaNGdXv+LW95S9Zdd93Mnj07d955Z975zndm7Nixqa+vz1133ZU//OEP2XPPPVNVVZXq6upUVVV1u8H2U089lTe84Q09+joAAICVLCEFAABD0FNPPZXf/va32XfffbPrrrt2+m/q1Kl517veld/85jfZdNNNU11dnV/+8pedbv+LX/yi0/GUKVMyf/78bLvttpk4cWImTpyY7bffPpdffnmX277a+973vixcuDAXXHBBhg8fnr333rtjbObMmR37YVRXV2efffbJ0UcfnWXLlmXRokU9/nqHDx+eL3/5y3nuuefy7W9/u0vtDz/8cLbccsuO2idOnJgbb7wxP/zhDzuWfBo2rPM/j0aOHJl//dd/zW233ZZ58+ZlypQpSUrLXt1+++2ZO3du9tprrySlmRLbb799fvazn+Xll1/uuI+2trbcfvvtXZaCerVXPzYAACDAAACAIemGG27IsmXLsu+++3Y7/v73vz8vv/xyfvSjH+Wwww7L97///Zxzzjn5/e9/n3POOSc/+MEPkqx8Y/3oo4/Oo48+miOOOCK33nprfvvb3+bYY4/NT3/600yYMOE1a9l6662z7bbb5qqrrso73vGOjk3Ak1IYcOutt+YrX/lKZs+enZ///Oc577zzUl9f/7r3+2pvetObMmPGjC7LNX3sYx/L8uXL87GPfSw333xzZs+enc997nNpamrKlltu2XFdTU1N/vd//zdz5szpWKZpjz32yK9//eustdZa2X777ZOUlsW67777UlVVlX/5l3/puP1nPvOZPPzww/nEJz6R2267LbfccksOOeSQLFmypCOkWZWampokyU9+8pM89thjvfq6AQBgqBJgAADAEPSjH/0ob3rTm7L11lt3O77zzjtn8803z3XXXZfDDz88xx57bG688cYcccQRmTt3bo4//vgkK/dgmDBhQq688spUVVXlxBNPzHHHHZennn
oqF1xwQacZFavyvve9Ly+//HLH5t0rHHjggTn11FNzxx135Mgjj8znP//5jB8/PpdeemlGjhzZ66/78MMP77LXxdi
"text/plain": [
"<Figure size 1600x4500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Настройка стиля графиков\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"# Создание фигуры\n",
"plt.figure(figsize=(16, 45))\n",
"\n",
"# График 1: Возраст vs Чистый доход\n",
"plt.subplot(4, 1, 1)\n",
"sns.scatterplot(x=df_cleaned['Age'], y=df_cleaned['Networth'], alpha=0.6, color='blue')\n",
"plt.title('Age vs Networth')\n",
"plt.xlabel('Age')\n",
"plt.ylabel('Networth (in billions USD)')\n",
"\n",
"# График 2: Страна проживания vs Чистый доход\n",
"plt.subplot(4, 1, 2)\n",
"sns.boxplot(x=df_cleaned['Country'], y=df_cleaned['Networth'], color='green')\n",
"plt.title('Country vs Networth')\n",
"plt.xlabel('Country')\n",
"plt.ylabel('Networth (in billions USD)')\n",
"plt.xticks(rotation=90)\n",
"\n",
"# График 3: Индустрия vs Чистый доход\n",
"plt.subplot(4, 1, 3)\n",
"sns.boxplot(x=df_cleaned['Industry'], y=df_cleaned['Networth'], color='purple')\n",
"plt.title('Industry vs Networth')\n",
"plt.xlabel('Industry')\n",
"plt.ylabel('Networth (in billions USD)')\n",
"plt.xticks(rotation=90)\n",
"\n",
"# Упорядочиваем графики\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Стандартизация данных для кластеризации"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# Выделяем числовые и категориальные признаки\n",
"numerical_cols = ['Networth', 'Age']\n",
"categorical_cols = ['Country', 'Source', 'Industry']\n",
"\n",
"# Масштабирование числовых признаков\n",
"scaler = StandardScaler()\n",
"df_numerical_scaled = scaler.fit_transform(df_cleaned[numerical_cols])\n",
"\n",
"# Кодирование категориальных признаков с помощью OneHotEncoder\n",
"encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # sparse=False для удобства\n",
"encoded_data = encoder.fit_transform(df_cleaned[categorical_cols])\n",
"\n",
"# Создаем новые столбцы для закодированных категориальных признаков\n",
"encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))\n",
"\n",
"# Объединяем числовые и закодированные категориальные данные\n",
"df_encoded = pd.concat([df_cleaned[numerical_cols], encoded_df], axis=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABR8AAAP0CAYAAAAjkkunAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxU9b3/8fc5zExmMkkmOEJMmCCY1LiBUqO4LyDWWrW4tVSr1atXq9halytaW9e61CpFwaXWnbYXa1XEWvUqWGm1LtGrtS74M4LMaEzCkHUmk5lhzu8Pb1JCAmRgTmbh9Xw8eADnnJz5JPkmkPf5fr5fw7IsSwAAAAAAAACQYWa2CwAAAAAAAABQmAgfAQAAAAAAANiC8BEAAAAAAACALQgfAQAAAAAAANiC8BEAAAAAAACALQgfAQAAAAAAANiC8BEAAAAAAACALQgfAQAAAAAAANiC8BEAAGAbYVlWtkvANohxBwDAto3wEQAAbLXTTjtNdXV1A37tscceOuyww3Tttdeqo6Nj0NusXLlS11xzjY444ghNnjxZhx12mC6++GJ99NFHG32dX//616qrq9P111+/2ZrOPPNM7bvvvorH4xu95thjj9Wpp54qSaqrq9P8+fOH8d5m1uWXX65p06b1/33atGm6/PLLM/oaX375pc455xx9/vnntr7O1uju7tYPf/hD7bnnntpnn320atWqQdc88cQTqqur0y9+8Ysh7zF//nzV1dXZXOnW63s/QqFQxu75+uuvq66uTq+//vpGr7n88ssHfZ2u/+u5557LWD2SFI/HdeONN+rpp5/O6H0BAEB+cWS7AAAAUBh22203XX311f1/TyQSev/99zV37lx9+OGH+u///m8ZhiFJ+p//+R9ddtll+trXvqbzzjtPgUBAX375pR5++GF95zvf0d13360DDzxwwP1TqZQWL16snXfeWU899ZQuvfRSeTyejdZz4okn6tVXX9Xy5ct1xBFHDDr//vvv6+OPP9Yvf/lLSdKjjz6qHXbYIRMfiq2yYMEClZSUZPSer776ql5++WXbX2drLF68WC+99JKuuuoqfe1rX1MgENjotb///e911FFHqb6+fgQrzJzDDjtMjz76qMaOHTvirz1mzBgtWLBgyHMTJkzI6Gu1tLTo4Ycf1k033ZTR+wIAgPxC+AgAADKipKREe+2114Bj++yzjyKRiO644w69++672muvvbR69WrNmTNHBx98sObNm6dRo0b1X3/kkUfqe9/7nubMmaNly5bJ5XL1n/v73/+uL7/8UnPnztX3v/99/fnPf9bJJ5+80XpmzJghn8+nJUuWDBk+PvnkkyopKdE3vvENSRpUe7bstttuBfU6w9Xe3i5JOuWUU/pD6o0pKSnRT3/6Uy1ZskRut3sEqsus7bbbTtttt11WXtvlcuXMWAcAANsG2q4BAICt9thjD0nSF198IUlauHCh4vG4fvaznw0IHiXJ4/Fozpw5OvHEEwe1aj/++OPaeeedtffee2vq1Kl69NFHN/m6RUVFOuaYY/TXv/5V3d3dA84lEgk988wz+ta3vtU/e3LDtuuHH35YRx11lCZNmqSDDz5Y11xzTf99QqGQ6urq9MQTTwy474Yt1OvWrdO9996rY445RpMnT9Zee+2lWbNm6bXXXtto3eu3Q/e1EQ/1q6/Wzb3GE088oSuuuEKSNH369P57b9h23dXVpZtuuklHHHGEJk2apGOOOUZ/+tOfBtV2xx136Je//KUOOOAATZ48WWedddaQLdLr6+3t1Z133tn/8TzyyCN17733KpVKSfqqbb/v/dlll1022w4+Z84crV69WnPnzt3kdRt+PqTBn7u+duV//OMfOu200/qXAHjsscfU0tKiCy64QFOmTNGhhx6qhx56aMC92tvbddVVV+mAAw7QpEmT9J3vfEf/+Mc/BlxTV1enBQsW6IQTTtDkyZO1YMGCIduuX375Zc2aNUt77b
WXDjroIF111VXq7OzsP//mm2/qrLPO0j777KM99thD06ZN0/z58/s/hpn24osv6oQTTtCkSZN04IEH6he/+IWi0eiga0455RRNmTJFe+yxh4466ij9/ve/l/TVx3n69OmSpCuuuKL/83DaaafptNNOG3CfDVvGn3jiCe2222567LHHdOCBB2rffffVJ598Mqy6YrGYrrnmGh1yyCH9Nd1///22fIwAAMDwED4CAABbrVy5UpJUXV0tSfrb3/6m3XbbTRUVFUNev//+++uiiy7SmDFj+o+1t7dr2bJlmjlzpiTp+OOP13vvvaf3339/k6994oknqre3V88///yA48uXL9fatWs3OnPyz3/+s371q1/p1FNP1f3336/Zs2frqaeeGtZak+u79dZbddddd+m73/2u7rvvPl1//fVqb2/XhRdeqJ6ens2+/cknn6xHH310wK+9995bXq9XRx999LBe47DDDtN5550n6atW6/PPP3/Q68RiMZ1yyil6+umndfbZZ+uuu+7S3nvvrSuvvFL33HPPgGsfeeQRffrpp7rpppv0i1/8Qv/61780Z86cjb4PlmXphz/8oe677z6dfPLJuueee3TUUUdp3rx5/W36V199tU466SRJX7W/D1Xj+vbbbz9997vf1cKFC/XWW29t9uM4HBdffLGmTZum3/zmN5o4caKuvvpqnX766fra176mu+66S5MnT9ZNN92kf/7zn5K+ClR/8IMfaOnSpbrooou0YMEC7bDDDjr77LMHBZD33HOPjj32WN1xxx39M23X99JLL+ncc8+V3+/XvHnzdOmll+rFF1/URRddJEn66KOPdMYZZ6i8vFy//vWvdffdd6u+vl4LFizQs88+m/b7mkwmB/1af1OYp59+WrNnz9ZOO+2kO++8UxdccIGWLFmi888/v/+6v/71r5o9e7Z233133XXXXZo/f76qq6t13XXX6d1339XYsWP727vPO++8jbZ6b8y6dev0wAMP6IYbbtAVV1yhmpqaYdV14403avny5ZozZ47uv/9+TZ8+Xbfccosef/zxtD9OAAAgM2i7BgAAGWFZlpLJZP/fOzo69MYbb+juu+/unxklfbX5ya677prWvZ9++mmlUil9+9vflvRVe/Z1112nRYsWbTIQ3H333bXrrrvq6aef1oknnth/fPHixaqrq9OkSZOGfLs33nhDgUBAp556qkzT1L777qvi4uIhN87ZlJaWFl100UUDZnoVFRXpRz/6kVasWLHZ9tcddthhwDqUDz30kN5++20tWLBANTU1w36N8ePHS5J23XXXIddSfOKJJ/Txxx9r0aJFmjJliiTp4IMPVjKZ1F133aVZs2apvLxcklRWVqa77rqrf9bq6tWrNX/+fLW1tWn06NGD7r18+XK9+uqrmjt3rr71rW9Jkg488EC53W7dfvvt/QFf3/s53Jbgyy67TH/729/005/+VE899dRWt1+feOKJOvPMMyVJxcXF+s53vqPJkyfrwgsvlPTVjMz/+Z//0dtvv63Jkyfrqaee0kcffaQ//vGP2nPPPSVJhxxyiE477TTdeuutA8Ku+vr6/ntL0nvvvTfgtefPn69dd91VCxYs6G85d7lcuv3227VmzRp99NFHOuCAA/SrX/1KpvnV3IEDDzxQy5Yt0+uvv97/cR2Ozz//XLvvvvug45dcconOOeccWZalW2+9VQcffLBuvfXW/vMTJkzQGWecoZdfflmHHXaYPvnkEx1//PG68sor+6+ZMmWKpk6dqtdff1177rln/9f5+PHjt6jN/4c//KEOO+wwSRp2XW+88YYOPPDA/o/J1KlTVVxcLL/fn/brAwCAzCB8BAAAGfHmm28OCjVM09QBBxyg6667rj9UGTVqlNatW5fWvR9//HFNnTpVLpervxV12rRp+vOf/6w5c+ZscuOUE088UTfeeKOam5tVUVGh9vZ2vfTSS7rssss2+jb77befHn30UZ1wwgk64ogjdOihh+rYY4
/d7FqEG7rtttskSWvXrtWnn36qzz77TC+99JIkbXIX7qH87W9/0y233KLzzz9/wBqWmXiNN954Q+PGjesPHvscd9x
"text/plain": [
"<Figure size 1600x1200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Применение PCA ТОЛЬКО к числовым данным\n",
"pca = PCA(n_components=2)\n",
"kc_pca = pca.fit_transform(df_numerical_scaled)\n",
"\n",
"# Визуализация\n",
"plt.figure(figsize=(16, 12))\n",
"plt.scatter(kc_pca[:, 0], kc_pca[:, 1], alpha=0.6)\n",
"plt.title(\"PCA Visualization of Numerical Features\")\n",
"plt.xlabel(\"Principal Component 1\")\n",
"plt.ylabel(\"Principal Component 2\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Агломеративная (иерархическая) кластеризация"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABSEAAAP1CAYAAACe9CqJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADd+klEQVR4nOzdeXiV5Z344e+BJICKgqigggpYcQURRZ1WUbDqVLsgrVoVt/oTFRUV61L33bGAsriAjAu4K1bt1NbWpdgZFYuK1NYpdS0oqIjgCgnh/f3B5JRAgCTkycly39fFdZ2c9cnZwvmc533eXJZlWQAAAAAAJNKi0AMAAAAAAJo2ERIAAAAASEqEBAAAAACSEiEBAAAAgKRESAAAAAAgKRESAAAAAEhKhAQAAAAAkhIhAQAAAICkREgAAAAAICkREoCCueCCC6JHjx5V/rvgggsKPTxgBYsWLYo+ffrEzJkzY9GiRXHqqafGHXfcUehh0QDMnDkzDjrooCgtLS30UGjC3nnnnejfv398/vnnhR4KALVUVOgBANC8bbrppjFu3LhKx51++ukFGg2wOhtttFGccMIJcfjhh0eWZdGjR4/4j//4j0IPiwJbsmRJnH/++fHzn/88SkpKCj0cmrBu3brFgAED4uqrr44bbrih0MMBoBZESAAKpry8PNZbb73YddddKx3vgyw0TKeffnoceeSR8fnnn8fWW28dLVu2LPSQKLD77rsvioqK4oADDij0UGgGTj755Nhvv/3iuOOOi5122qnQwwGghmyODUDBLF26NFq3bl2t806fPj2OOeaY6NWrV/Tt2zfOP//8WLBgQf70Rx99NHr06BFz5sypdLn+/ftX2rS7rKxstZuAr3xdr7/+egwcODB69uwZ3//+9+N3v/tdpev+4osv4rrrrosDDjggdtlllzj00EPjkUceWeX2V76dOXPmxODBg+OCCy6I2267Lf7t3/4t+vTpE6eddlp88MEHlS7/9NNPx1FHHRW9e/eOnXfeOQ4++OC4995786dPmzYtf72vvPJKpcvec8890aNHj+jfv/8q47n44osrnXfRokWx8847R48ePWLatGnVvv3Vefjhh+Owww6LXXfdNXr27Bk//OEP47e//e0q93FVm+Cv7vEZPHhwpdt48skn47DDDovevXvHt7/97bj00ktj0aJF+dPHjh0bPXr0iN69e6+ymeiZZ565ymb/S5YsiRtuuCH69esXO++8c3z/+9+PJ598stLl+vfvHzfeeGNce+21sccee8See+4Z5513XixcuLDav/+aliF49NFH84/pio/Dp59+GrvvvnuVj2WPHj1i++23jz322CPOOOOM+Oyzz/Ln6dGjR4wdO7bS2Crul9rclxERm2yySXTr1i1eeOGFtS6dsPJt/eY3v4k99tgjRo4cGRGVn78r/1tx3P/7v/8bp59+euy1116x0047xT777BNXX311LF68OH+e0tLSuOmmm2LAgAHRs2fPOPTQQ+NXv/pVte7ziIgPP/wwzjnnnOjbt2/06tUrjjvuuPjb3/6Wv/45c+ZEjx494je/+U2ccsop0atXr9hvv/3i5ptvjmXLllV6XFa+T84555xKj2mWZTF69OjYZ599ok+fPnHKKafE3Llz8+cvLy+PCRMmxKGHHho9e/aMXXfdNY488sh46aWX1vg4Rqz6mK/8c5ZlceSRR1Z6v7zgggsqPbciIh544IEqnz8rKi0tjTvvvDMOPfTQ/HGDBw9e5bW68nO6qrH/6U9/WuX59OWXX8ZVV10V++yzT+y6664xaNCg+OMf/7jK9a7t+bNkyZK4+eab4+CDD45ddtklDjzwwJgwYUKlx23w4MGVLr/bbrvFiSeeGLNnz86fp6rHdkVV/e4PP/xwHHLIIbHzzjvHfvvtF2PHjo3y8vLVXkfE6t8DV379r+29qLqP65NPPhnf+973Ytddd43DDjsspk+fnj+tus+z6rxGV7
zMV199FYMHD44dd9wxlixZUu2xbrrpprHXXnvF+PHj13gfAtAwmQkJQMF88803sdFGG631fH/+85/jhBNOiL322ituuummWLRoUYwePTqOPfbYeOSRR6odMiOWfxiNiLj11ltj4403jojlHxJXjocREUOGDIljjjkmzj777HjkkUfirLPOivHjx0e/fv1i8eLFcdRRR8Wnn34aZ555Zmy55Zbx9NNPx0UXXRTz58+PU045JX89/fr1i9NOOy3/82abbRYREc8880y0b98+Lr744li2bFmMHDkyBg8eHL/5zW+iTZs28cc//jGGDh0axx57bJxxxhmxePHiuO++++LKK6+MnXfeOXr16pW/zvXXXz+effbZ6NOnT/64J598Mlq0WPX7xvXXXz/++Mc/RpZlkcvlIiLi97///SofjGty+yu699574+qrr44zzjgj+vTpE4sWLYrbb789zj333Ojdu3d06tQpf95x48bFpptuGhGRfzwiIn784x/HT37yk/zPV1xxRaXbuOWWW2LMmDFx1FFHxdlnnx2zZ8+O0aNHx4wZM+Khhx6q9JzI5XLx4osvRr9+/SJi+YffqVOnVrpvsiyLoUOHxquvvhpnnnlmdO/ePf7whz/E2WefHaWlpfGjH/0of9777rsvtt5667juuutiwYIFMXLkyHj//ffjgQceiFwut9bf/7TTTosjjzwyIpbPLNxxxx3zz4+tttoq/vGPf6xyn44cOTK++OKL2HDDDSsdX/HcKisri7fffjtuuOGGuOaaa2LEiBFVPjZVqcl9WaGsrCyuvfbaat9GRMTixYvjyiuvjJNOOim+//3vVzrt0ksvrTSr6Ygjjsgf/vjjj+Poo4+OXXfdNa6//vooKSmJ559/Pu68887YbLPN4uSTT46IiHPPPTemTp0ap556avTq1SumTp0aF1xwQRQXF6/1Pl+wYEEceeSR0aZNm7jkkkuiTZs2cffdd8fRRx8djzzySHTv3j0/nssvvzz69esXY8eOjVdeeSXGjRsXX3/9dfz85z+v8veePn16/OY3v6l03F133RXjx4+P8847L7p27RrXX399DBs2LB566KGIiBgxYkTcf//9MXz48OjRo0d89NFHcfPNN8ewYcPij3/8Y7Rp06ZG9/2KHn/88XjttdfWeJ5FixbFTTfdtNbrmjZtWnz00Udx4IEH1no8EVU/n8rLy+PEE0+M9957L84888zo1q1b/OpXv4qhQ4fG3XffHbvvvnv+vGt6/mRZFqecckrMmDEjTj/99Nh+++1j2rRpcdNNN8Xs2bPjqquuyp93xx13jMsuuyyWLl0ac+bMiZEjR8Z5550X999/f61+r/Hjx8eNN94YxxxzTFx44YXx5ptvxtixY2Pu3Llrff2s/B54yy23xFtvvVXpPGt7L1pZVY/rzJkz49xzz40f/ehHcckll8Q999wTp5xySvzud7+LTTbZpFq/Z3VfoyuPff78+XH33XdXufXDmp6DBx98cFxxxRXx1Vdfxfrrr1+tMQLQMIiQABTMwoUL80FuTUaOHBldu3aN8ePH5zf/7NWrVxxyyCExZcqUOProo6t9m19//XVERPTu3Tvat28fEctn4FRl8ODBMXTo0IiI2GeffWLgwIFx8803R79+/eLRRx+NWbNmxQMPPBC9e/fOn2fp0qVxyy23xJFHHhnt2rWLiOVxbeVNziOWR9hHH300unTpEhHL17saOHBgPPbYY/HTn/403nrrrRg4cGBcdNFF+cv07t079txzz5g2bVqlCLjvvvvGM888kw8h8+bNi9deey123333VWZX7r333jF16tR4/fXX8+P67W9/G3vssUel2Xc1uf0VzZ49O372s59VCq9bbrllHHbYYfHKK6/EIYcckj9+hx12iM6dO69yHZ06dap0n22wwQb5w4sWLYpbb701Dj/88Lj00kvzx2+33XZx9NFHr/KcqLhvKiLks88+G5tuummlWVAvvPBC/OlPf4obb7
wxvve970XE8sfzm2++iREjRsShhx4aRUXL/9vUokWLuPPOO6Nt27YRsfzxHTp0aPzpT3+Kfffdt1q//1ZbbRURy5c
"text/plain": [
"<Figure size 1600x1200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 1 1 ... 1 1 1]\n"
]
}
],
"source": [
"# Построение дендрограммы (только для числовых данных)\n",
"linkage_matrix = linkage(df_numerical_scaled, method='ward')\n",
"\n",
"plt.figure(figsize=(16, 12))\n",
"dendrogram(linkage_matrix)\n",
"plt.title('Дендрограмма агломеративной кластеризации (числовые признаки)')\n",
"plt.xlabel('Индекс образца')\n",
"plt.ylabel('Расстояние')\n",
"plt.show()\n",
"\n",
"# Получение результатов кластеризации (только для числовых данных)\n",
"result = fcluster(linkage_matrix, t=100, criterion='distance') \n",
"print(result) # Вывод результатов кластеризации (номера кластеров для каждого образца)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABJ8AAAMQCAYAAACJzMTyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADBOUlEQVR4nOzdeXwTdf7H8XfaJOUqh1wFigcooHIvyOEPl2tdPFfEAxQEXH+AIiKgKL/1ANlFXDkEqgiLcgmCCIrXqquiqNyXJ6goupQbodxt0nZ+f5SEpDmapJmmSV/Px4MH7Uw6+WQymcy85/v9jsUwDEMAAAAAAACACZJiXQAAAAAAAAASF+ETAAAAAAAATEP4BAAAAAAAANMQPgEAAAAAAMA0hE8AAAAAAAAwDeETAAAAAAAATEP4BAAAAAAAANMQPgEAAAAAAMA0hE8AAAAAAAAwDeETACAu9evXT/369fM7b+XKlWrcuLEeffTREq4KQKSGDBmiZcuWxboMRMnrr7+uQYMGxboMAEApQfgEAEgohw8f1oQJE2JdBoAwrFixQgcOHFCvXr1iXQqipFevXjp06JBef/31WJcCACgFCJ8AAAnlqaee0unTp1WhQoVYlwIgBNnZ2Zo0aZKGDBmipCQOTROFxWLR4MGDNWXKFGVnZ8e6HABAjPENDwBIGB9++KE++OADDR06VNWqVfOal5+fr9mzZ+tPf/qTmjZtqj//+c9auHCh12P69eunRx99VC+++KI6duyoP/zhD7rvvvu0Z88er8d99NFHuuOOO9SqVSs1bdpUPXr00KJFi9zz169fr8aNG+uLL77QnXfeqebNm+vqq6/W4sWL3Y95+umn1bhxY61bt849bcWKFWrcuLHefPNNdz2FuxZOnjxZjRs31ooVKyRJjRs31owZM7weM2PGDDVu3Nin5ptvvlnNmjXTlVdeqb///e86ffq012O2bdumu+++W61bt1b79u01cuRIHThwwOs1rV+/XpL0448/qnv37urdu3fI60WSZs+erW7duumyyy5T48aN3f8KvwZPjz76qLp27er+/eWXX1arVq20ZMkSr/Xm759rPUnSxo0b9de//lVt27ZV06ZN1bVrV82YMUP5+fnux5w8eVLjx49Xp06d1LJlS/Xq1Uuffvqp+/0I9Dye62Xw4MFq3bq1WrduraFDh2r37t3u5YeybUi+76thGOrdu7caN26szMxMSVJOTo7Gjh2rDh06qF27dnrooYd07Ngx999kZ2dr8uTJuvrqq9W0aVO1bt1aAwcO1Pbt2wOuW0nKzMz0WneFf3c9d7du3by2s59++sm9fguvn2CWL1+unJwcdenSxWv61KlT/a7rwtvKsmXLdN1116lp06bq3LmzZsyYoby8vLBeoyR9+eWXfp/P8zP46KOPql+/fnr99dfVpUsXtWrVSv3799eOHTu8lv/rr7/qgQce0JVXXqmWLVuqX79+2rx5s8/ze/5z1di4cWO98soreuSRR9SqVSt17NhR//jHP5STk+P++7y8PM2ePVvXX3+9mjdvrpYtW6p3795e+xPXfqBVq1ZyOBxe9T3wwANeXZM963nrrbe8Hrtq1Sqf9zGU55ekLl26KCcnR8uXLxcAoGyzxroAAACiISsrS+PGjdPll1+ue+65R6+99prX/LFjx2rFihUaPHiwWrVqpY0bN2rChAk6fvy4hg4d6n7cxx9/rGrVqumxxx5Tfn6+Jk+erH79+undd99V+fLl9emnn2ro0KG66667NGzYMGVnZ2vx4sV66qmn1LRpU7Vo0cK9rBEjRuimm27SkCFD9PHHH2vcuHGSpDvuuEMjRozQp59+qieffFJvv/22Dh8+rH/84x+65pprdNNNN/l9jf/97381b968sNfN22+/rYceekg33HCDHnzwQe3Zs0dTp07Vzp07NXfuXFksFn3//ffq27evWrRooX/+85/Ky8vT5MmT9de//tUdhnl69t
ln1bRpU917772SFNJ6efPNNzV58mQNHjxYHTp0UPny5SVJt99+e8iv5cCBA5oyZYqeeuop/fGPf/Sal5GRoZo1a0qSDh06pPvvv989b8eOHRowYIB69OihqVOnyjAMvf3228rIyFCDBg103XXXKS8vT3fffbc7OGjQoIHeeOMNDR06VPPnz9eTTz6pkydPumu+5ZZbdOutt0qSLr74Yu3atUu9e/dWgwYN9Mwzzyg3N1czZ85Unz59tHLlSlWvXt1dT7Btw5+VK1dq69atPu/Bm2++qccff1yVK1fWuHHjNHbsWE2dOlWSNHr0aG3atEkjR47U+eefr99++03Tpk3TqFGj9O6778pisYS83gubM2eOOwRzuffee2W32zV+/HjVqlVLSUlJWrZsWZHdrt566y117txZdrvda3p2dra6du2qwYMHu6cV3lZmzZqlqVOnqm/fvhozZoy2b9+uGTNmaN++fWF3v83OzlZaWpqmTZvmnuZ6Xzxt375dv/zyi0aOHKkqVapo+vTp6tu3r9577z3VqlVLO3fu1G233aYLL7xQjz32mGw2mxYsWKD+/fvr5Zdf1hVXXOG1zjp37ixJXq9/2rRpatGihZ577jn9/PPPeu6553To0CE999xzkqRJkybp1Vdf1ahRo9S4cWMdOHBAzz//vIYPH65PP/3U/dmSClogrV271v15OXXqlD777DO/rcwqVqyoTz75RDfeeKN72nvvvaekpCSvkDbU509JSVGXLl309ttv68477wzn7QAAJBjCJwBAQpgwYYKOHTuml156SVar99fbrl279Nprr2nkyJHuAXD/53/+RxaLRbNmzdIdd9zhbil15swZrVixQvXr15ckNWjQQD179tSbb76pPn36aOfOnerZs6f+9re/uZffqlUrtWvXTuvXr/cKn/70pz+5H9epUycdPHhQL7zwgvr06aNy5cpp4sSJuuOOOzR79mxt2bJFlSpV8nuy6/kaL7nkEn333XfuaUlJScrNzQ34N4ZhaNKkSerUqZMmTZrknn7hhRdqwIAB+uyzz9S5c2e9+OKLqlq1ql5++WWlpKRIkmrVqqVRo0bpp59+8lrmb7/9pi+++EJvvfWWLrnkEkkKab18/fXXqlq1qkaOHBmw3qIsWbJETZo00c033+wz79JLL1V6erok+QQjO3bsUMeOHfXss8+6T7qvvPJKffLJJ1q/fr2uu+46rV69Wl999ZWef/55de/eXZLUvn177d69W+vWrfMKsyQpLS1NLVu2dP/+5JNPqnz58po3b54qVaokSerQoYO6d++uOXPm6JFHHnE/Nti2UTgUOnXqlCZNmqTLL7/c6703DEOjR492j5O0ZcsW94DdDodDp06d0mOPPaZrr71WknTFFVfo5MmTmjhxog4fPuwO6sK1b98+/etf//Kq58iRI9q9e7cef/xx9ejRw/3Yzz//POiyTp48qW+++UbXXHONz7wzZ86obt26XuvY04kTJ/TCCy/o9ttv12OPPSap4HNdtWpVPfbYYxo4cKB7+wzFmTNnVLlyZa/nc72PhZ/3xRdfVJs2bSRJzZs3V/fu3bVgwQI99NBDysjIkN1u14IFC9x/37lzZ11//fX65z//6RXGnX/++X5f33nnnacXX3xRVqtVf/zjH5WUlKSnn35aw4YNU8OGDXXw4EGNGDHCq1VWSkqKhg0bph9++MFrmVdddZU+/vhjd/j0ySefqGbNml5hkudjP//8czkcDtntduXk5Ojjjz9W27Zt3a37JIX1/M2aNdN7772nkydP+l2fAICygW53AIC499lnn2nlypUaNGiQmjRp4jN/3bp1MgxDXbt2VW5urvtf165dlZOT49UdpnXr1u7gSZIuu+wy1a9fXxs3bpQk3XPPPZo4caJOnTqlb7/9Vu+9955mzZolST5dW3r27On1+9VXX61Dhw5p165dkgrCmQEDBuj555/XmjVrNHHiRFWpUsXva1y9erXWrFnjFWBIUv
Xq1d1d4/z55ZdftH//fp/X3rZtW1WqVElffvmlJGnz5s266qqr3MGTq75PPvlEl156qXva6dOnNXXqVLVr187rxD6
"text/plain": [
"<Figure size 1200x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Выбираем подмножество данных для кластеризации\n",
"# у меня всего два числовых столбца мне грустно.....\n",
"features = df_encoded[['Networth', 'Age']]\n",
"\n",
"scaled_features = scaler.fit_transform(features)\n",
"\n",
"# Построение дендрограммы\n",
"linkage_matrix = linkage(scaled_features, method='ward') \n",
"\n",
"plt.figure(figsize=(12, 8))\n",
"dendrogram(linkage_matrix, labels=df.index, leaf_rotation=90, leaf_font_size=10)\n",
"plt.title('Иерархическая кластеризация (дендрограмма)')\n",
"plt.xlabel('Индекс миллиардера')\n",
"plt.ylabel('Евклидово расстояние')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Визуализация распределения кластеров**"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAACbMAAAuoCAYAAADST7XUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3yfdb3//8c1PjM7adM23XtPOqHQYRkCsl0MEQVE8IiiHtGDyu/I8ciRLw7QowiuA4pAEdnILpRCoZsOukfa7DTN+Mxr/P5ImzZNUrqSNO3zfrvlRvJ+X9f7/bquz+dzlZIn77fh+76PiIiIiIiIiIiIiIiIiIiIiIiISCcyO7sAEREREREREREREREREREREREREYXZREREREREREREREREREREREREpNMpzCYiIiIiIiIiIiIiIiIiIiIiIiKdTmE2ERERERERERERERERERERERER6XQKs4mIiIiIiIiIiIiIiIiIiIiIiEinU5hNREREREREREREREREREREREREOp3CbCIiIiIiIiIiIiIiIiIiIiIiItLpFGYTERERERERERERERERERERERGRTqcwm4iIiIiIiIiInJB83z+l55fDo9dJREREREREROTkoTCbiIiIiIiIiIgck2uuuYZRo0axatWqVvvnzp3L7bfffkRjbtiwgc9//vPHo7yj8pvf/IaHHnqo6ef77ruP4cOHd1o9ramvr2f8+PGMHj2aioqKTqtj0aJFfO1rX+PMM89k/PjxnHvuudx9991UVVU1HVNcXMzw4cN58sknj+vcr776Kt/97neP65giIiIiIiIiItJ5FGYTEREREREREZFj5rou3/ve90ilUsdlvBdffJFly5Ydl7GOxi9/+Uvi8XinzX84nn32WbKyssjOzuaJJ57olBruuecerrvuOmzb5j/+4z/43e9+x5VXXskzzzzDZz7zGUpKStp1/j/96U/tPoeIiIiIiIiIiHQchdlEREREREREROSYZWVlsWHDBn796193dimnjCeffJIzzzyTs88+m8cffxzP8zp0/ueee47f//733H777fziF7/gvPPOY/r06Vx77bU8/PDDVFdX81//9V8dWpOIiIiIiIiIiHRtCrOJiIiIiIiIiMgxGzlyJJdccgkPPvggH3744cce//jjj3PBBRcwZswYZs+ezX333YfrukDjlp73338/AMOHD+eXv/wl06dP56677mo6P5VKMX78eK688spm41588cX88Ic/BCCZTPLrX/+a8847j7Fjx3LOOefwwAMPNAt9XXPNNXz729/m61//OhMmTOC6665r2k70/vvvb7G16BtvvMFFF13E2LFjOffcc3nqqafavMZnnnmG4cOHs379+mbtr7zyCsOHD2fNmjUA/PnPf26q8cwzz+TOO++kvr7+kPdv48aNrFixgtmzZ3PRRRexc+dO3nrrrRbHlZeX881vfpOpU6cyZcoUfvjDH/Lzn/+cuXPnNjvuUK9HWx544AGGDBnCtdde26JvwIABfOc732HixIn4vt+iv61tW4cPH859993X9POzzz7LRRddxLhx45g+fTrf/va3KSsrAxpfu8WLF7N48WKGDx/Oe++9B0BNTQ0//OEPOf300xk7diyf+cxnWLRoUYt57r//fi677DLGjRvH/fffj+d5TfdmzJgxzJ07l//3//4f6XT6kPdBRERERERERESOH7uzCxARERERERERkZPD97//fRYuXMj3vvc95s+fTzAYbPW43/3ud/z85z/n6quv5nvf+x5r167lvvvuo6SkhJ/85Cd8+tOfprS0lCeeeIK///3v9OzZk+Li4maBpGXLlpFIJFi1ahXJZJJQKER5eTnr1q3j1ltvxfd9brrpJpYvX87XvvY1RowYwXvvvccvfvELduzYwY9//OOmsV544QUuuugi/vd//xfP87j11lv57Gc/yxVXXMGnP/3pZrX/8Ic/5Bvf+AaFhYVNq5
KNGDGCESNGtLjOefPmEY1Gee655xg2bFhT+7PPPsvQoUMZNWoUzz77LD/72c/47ne/y/Dhw9m8eTN333038Xicu+++u817PX/+fHJzc5kzZw7BYJD+/fvzt7/9jVmzZjUdk0qluPbaa4nFYnz/+98nMzOTBx54gLVr19K9e/fDfj1aU1FRwbp167j++usxDKPVYw4OGh6pJUuW8O///u/cfPPNTJkyhdLSUn72s5/xrW99i4cffpgf/ehHfOc73wHgRz/6EUOGDCGZTHLttddSWVnJN7/5TQoLC5k/fz7XX389Dz74IDNmzGga/7e//S3f+ta3GDhwIL179+b3v/89f/vb3/jud79L3759WbFiBT//+c8JBAJ8/etfP6ZrERERERERERGRw6Mwm4iIiIiIiIiIHBc5OTn853/+J1/96lf59a9/zTe/+c0Wx9TV1fGb3/yGz372s9xxxx0AzJw5k9zcXO644w6uu+46hg4dSs+ePQGYMGECALNnz+bpp5+mvLycwsJCFi1axOjRo1m9ejXLly9n2rRpvPXWW4TDYU4//XQWLFjAO++8w7333ssFF1wAwBlnnEE4HOaXv/wlX/jCFxg6dCgAgUCA/+//+/9ahO969uzZNP8+d911F2eddRYA/fr14+yzz2bx4sWthtkikQjnnnsuzz//fNO9aGho4PXXX+eWW24BYPHixfTp04errroK0zSZOnUq0WiUPXv2tHmfHcfh6aef5sILL2yq+dJLL20KoPXq1QuAp59+ms2bNzN//nzGjBkDwPTp05k3b94Rvx4HKykpAaBPnz5t1nmslixZQjgc5sYbb2y6ztzcXFatWoXv+wwZMoTMzExg//vkscceY926dTz22GOMHz8egLPOOotrrrmGe+65h/nz5zeNP3nyZK677rqmn3/6058yZswYLr/8cgCmTp1KJBIhKyur3a5RRERERERERESa0zajIiIiIiIiIiJy3MydO5eLLrqIBx98kNWrV7fo37ei2ty5c3Ecp+lr37aXCxcubHXcmTNnYlkW77zzDgDvvvsu5513HgMGDOD9998HYMGCBUyfPp1wOMzixYuxbZvzzjuv2TgXXXQR0Bgi22fQoEFtriJ3sMmTJzd9vy/IVVtb2+bxF198Mdu3b2flypUAvPrqq6RSqaY6pk+fzpYtW7jsssu4//77WbVqFZ/61Ke45ppr2hzzjTfeoLKyknnz5lFbW0ttbS1z587F8zwef/zxpuPeffdd+vbt2xRkA8jMzGTOnDlNPx/t62Hbjf+P7IFbth5vU6ZMIR6Pc+GFF/L//t//44MPPmDmzJl87Wtfa3M1uEWLFtG9e3dGjx7ddC2u6zJnzhw+/PDDZiHBkSNHNjt32rRpLFy4kCuvvJIHH3yQjRs3cvXVV3PxxRe32zWKiIiIiIiIiEhzWplNRERERERERESOqzvuuINFixY1bTd6oJqaGgBuvPHGVs8tLy9vtT0nJ4eJEyeyaNEi5s2bx6pVq7j99tvZsWMHixcvxnVdFi1axG233QbAnj17yMvLw7KsZuPs216zrq6uqS0jI+Owry0ajTZ9b5qN/5+o7/ttHj9t2jR69OjBc889x7hx43juueeYOnVq08pz559/Pp7n8de//pXf/OY33HffffTu3Ztvf/vbnH/++a2Oue+efvGLX2zR98QTT3DzzTdj2za7d++moKCgxTEHth3t69GrVy8Mw2Dnzp1tXvuePXuwbfuI7u+BJk6cyAMPPMCf/vQn/vjHP/LAAw/QrVs3brrppjbDfjU1NVRUVDB69OhW+ysqKsjJyQGav5YA119/PRkZGcyfP5977rmHn/3sZwwdOpQ77riD6dOnH9U1iIiIiIiIiIjIkVGYTUREREREREREjqucnBzuvPNObrnlFn7zm98068vOzgbgnnvuYcCAAS3O7datW5vjzpo1i4cffpgPPviAYDDImDFjKC4u5umnn2bx4sXs2bOnadWxnJwcdu/eje
u6zQJt+8JZeXl5x3qZh8U0TT71qU/x7LPPctNNN7Fw4UL+8z//s9kxF154IRdeeCF1dXW8/fbb/P73v+c73/kOp51
"text/plain": [
"<Figure size 2500x3000 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# pandas, StandardScaler, KMeans, seaborn and matplotlib are imported once in the\n",
"# notebook's first cell, so they are not imported again here.\n",
"\n",
"# One-hot encode the categorical columns (drop_first=True omits the first level of each).\n",
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
"\n",
"# Cluster on the two numeric features only.\n",
"features = df_encoded[['Networth', 'Age']]\n",
"\n",
"# Standardise so that Networth and Age contribute on a comparable scale.\n",
"scaler = StandardScaler()\n",
"scaled_features = scaler.fit_transform(features)\n",
"\n",
"# Fixed seed: without it the cluster ids (and therefore the colours below)\n",
"# can change from one run to the next.\n",
"kmeans = KMeans(n_clusters=3, random_state=42)\n",
"df_encoded['Cluster'] = kmeans.fit_predict(scaled_features)\n",
"\n",
"# One entry per stacked panel: (x column, y column, title, x label, y label).\n",
"# NOTE(review): 'Industry_Technology ' keeps the trailing space of the generated\n",
"# dummy column; presumably it comes from the raw Industry values - confirm.\n",
"cluster_panels = [\n",
"    ('Networth', 'Age', 'Networth vs Age Clusters', 'Networth (in billions USD)', 'Age'),\n",
"    ('Networth', 'Country_United States', 'Networth vs Country Clusters', 'Networth (in billions USD)', 'Country (United States = 1, Others = 0)'),\n",
"    ('Age', 'Industry_Technology ', 'Age vs Industry Clusters', 'Age', 'Industry (Technology = 1, Others = 0)'),\n",
"    ('Networth', 'Source_Amazon', 'Networth vs Source Clusters', 'Networth (in billions USD)', 'Source (Amazon = 1, Others = 0)'),\n",
"]\n",
"\n",
"# Draw every panel with the same styling, coloured by the cluster id.\n",
"fig, axes = plt.subplots(4, 1, figsize=(25, 30))\n",
"for ax, (x_col, y_col, title, x_label, y_label) in zip(axes, cluster_panels):\n",
"    sns.scatterplot(x=df_encoded[x_col], y=df_encoded[y_col], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6, ax=ax)\n",
"    ax.set(title=title, xlabel=x_label, ylabel=y_label)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## KMeans (неиерархическая кластеризация) для сравнения"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Центры кластеров:\n",
" [[ 4.6469914 80.60315186]\n",
" [ 3.49202201 48.5914718 ]\n",
" [80.24333333 65.36666667]\n",
" [ 3.76886463 64.24366812]]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd5wdVfn48c+027f3ks2mbnrvQEhCD5EuolSRjl/5ogL6QykqflUQlUQQKRYERYwivYcSCAkJCaSTnmzv9dYpvz8uuclmd9O3BJ7365VXsnPmznnm3Nmbe+aZc47iOI6DEEIIIYQQQgghhBBCCCFEH6L2dgBCCCGEEEIIIYQQQgghhBD7kgSGEEIIIYQQQgghhBBCCCH6HElgCCGEEEIIIYQQQgghhBCiz5EEhhBCCCGEEEIIIYQQQggh+hxJYAghhBBCCCGEEEIIIYQQos+RBIYQQgghhBBCCCGEEEIIIfocSWAIIYQQQgghhBBCCCGEEKLPkQSGEEIIIYQQQgghhBBCCCH6HElgCCGEaMdxnN4OoU/r7fbp7fqFEEIIIcSXm3wf3b/ebp/erv9YJm0nRN8kCQwhRK+79NJLufTSSztsb21t5cILL2TUqFG88cYbiX1LSkq46KKLujzezTffTElJCT/4wQ+6LebuEolE+POf/8z555/PxIkTmTJlChdddBHPPvtsuy9T8+fPp6Sk5KjWHY1G+fnPf87zzz9/VI7X1fvaE0pLSykpKeG8887DNM0O5UuXLqWkpISlS5ce0nGfeeYZfvnLXx6tMA9JZWUl11xzDWVlZYltc+bM6XPX+T//+U9KSkq47rrrejsUIYQQQnxBSH9hD+kvHB3SX+g927Zt46677uLkk09mzJgxzJo1i+9+97ts2LChV+PqzfdOCLF/ksAQQvRJra2tXHXVVWzYsIHf//73nHzyyYkyVVVZtWoVlZWVHV4XDAZZtGhRT4Z61NTW1vK1r32Nhx56iNmzZ/Ob3/yGX/3qV4nO1Y9//ONufSKkurqav/zlL51+gT8cd955J3feeedROdbhWrt2LY888shRO95DDz1EY2PjUTveofjggw945513eqXuQ7Fw4UKGDh3Ku+++S0VFRW+HI4QQQogvKOkvSH/haJD+Qs967bXXOPfcc1m7di3XX389jzzyCDfffDPbt2/nwgsv5P333++12HrzvRNC7J/e2wEIIcS+dndG1q9fz0MPPcRxxx3XrnzEiBFs3ryZV155hSuuuKJd2aJFi/B6vSQnJ/dgxEfHbbfdRmVlJU8//TTFxcWJ7bNmzSI/P5/777+f2bNnc9JJJ/VekIdg8ODBvR0CycnJiQ7tkCFDejucL7wtW7awatUqHn30UW6++Waefvpp/vd//7e3wxJCCCHEF4z0F6S/cLRIf6Hn7Ny5k9tuu40TTjiB3/72t2ialig79dRT+frXv85tt93GW2+9hcvl6sVIhRB9jYzAEEL0KW1tbVx99dVs3LiRP/7xjx06IwA+n48TTzyRV155pUPZSy+9xGmnnYaut8/P2rbNH//4R0455RRGjRrFaaedxhNPPNFuH8uy+OMf/8i8efMYM2YM48aN46KLLuLDDz9M7DN//nxOOeUU3n77bb7yla8kjvXss8+2O9Zf/vIXTj/9dEaPHs0JJ5zAXXfdRWtra5fnvX79ehYvXsy3vvWtdp2R3a644gouvvhifD5fp6/vbGjwv//9b0pKSigtLQUgHA5z1113MXPmTEaNGsXpp5/OY489BsSHUO/u6Pzwhz9kzpw5ieMsX76cSy65hLFjxzJlyhRuu+026uvr29UzYsQInnnmGY477jimTJnC5s2bOwwJLykp4cknn+T2229nypQpjB8/nptuuona2tp2cT/22GOcdNJJjBkzhosuuoi33nqr3fDt3cO958+f32V77nbttdcSCAT4wQ9+gGVZ+923sbGRO+64gxkzZjB69GguvPBClixZ0q
6Ny8rK+M9//kNJSQl//etfKSkpYd26dYl9nn32WUpKSnjmmWcS29avX09JSQkrV64EYPv27XznO9/huOOOY9y4cVx66aWsWLEisf/u8/vTn/7E6aefztixY1m4cCE//OEPATjppJPavdexWIxf/epXieNdeeWV7Nixo8vzvPLKKznvvPM6bL/hhhs466yzAKivr+d73/sexx13HKNHj+bss8/ucI13ZuHChaSkpDBt2jROO+00/vWvf3X6hN7bb7/Neeedx5gxYzjttNN44YUXOOWUU9q9pwd6P4QQQgjx5ST9BekvSH/h2OwvPPHEE0SjUX70ox+1S14AeL1ebrvtNs4//3yampoS21966SXOO+88xo8fz3HHHccdd9zRrvwHP/hBu2tx7/b597//DeyZDmzJkiVceeWVjB07luOOO45777038Z7v+96VlpZ2et0++eSTlJSUsG3btnZ1/ve//2X48OEyAl2IbiIJDCFEnxEMBrnmmmtYt24djzzyCFOnTu1y37lz53YYFt7a2sq7777LvHnzOux/11138cADD3DWWWfxhz/8gdNPP52f//zn/P73v0/sc9999/Hggw/yta99jUcffZSf/vSnNDY2ctNNNxEKhRL71dTU8JOf/ITLLruMP/7xjxQWFnLbbbexZcsWAF544QXuvfdeLr74Yh577DFuvPFG/vvf//LTn/60y/N57733ADp8+drN7XZzxx13MH369C6PcSA///nPeffdd7ntttsSX/p/9atfsXDhQrKzs1mwYAEA119/feLfH330EVdccQUej4ff/va3/L//9/9YtmwZl112GeFwOHFsy7J4/PHHueeee/jhD3/IoEGDOo3hN7/5DbZtc//993PrrbeyaNEifv7znyfKFyxYwH333ccZZ5zBgw8+yNixYzs8wZ+dnc3TTz/NV7/61QOec3p6OnfccQdr1qzh0Ucf7XK/SCTC5ZdfzptvvsnNN9/MggULyM3N5aqrrkp0ShYsWEBWVhYnnngiTz/9NOeeey4ul4sPPvggcZzdndfly5cntr377rukp6czduxYNm/ezHnnnUdpaSk/+tGPuO+++1AUhcsvv5xly5a1i2n+/PlcffXV/OpXv2LGjBlcf/31iThuuOGGxH4vvfQSmzZt4he/+AV33nkna9as4eabb+7yXM866yzWrl3brtPS3NzMu+++y9lnnw3ALbfcwpYtW7j77rt55JFHGDFiBLfddlu7zvm+TNPkueeeY968eRiGwbnnnktNTQ1vvfVWu/0+/PBDbrjhBvLy8pg/fz4XX3wxd955Z7sv+wfzfgghhBDiy0f6C9JfkP7CHsdaf+G9995jxIgR5OTkdFo+ffp0br75ZrKysgB48MEH+e53v8u4ceN44IEHuPHGG3n11Ve59NJL211bB+v73/8+EydO5A9/+APz5s3j0UcfTSSS9n3vsrOzgY7X7bx583C73fz3v/9td+xnn32W6dOnk5eXd8hxCSEOTKaQEkL0Cbs7I7ufLAkGg/vdf9asWXi93nbDwl9//XUyMjKYOHFiu323bdvGP//5T7773e9yzTXXAHD88cejKAoPP/ww3/jGN0hLS6O6upqbb7653VNAbreb//mf/2Hjxo2MGzcOgFAoxD333JPoHBQXFzN79mzeeecdBg0axLJlyygsLOTiiy9GVVWmTJmCz+dr96TIvnbfvC0sLDz4RjtEy5Yt47jjjuPMM88EYOrUqfh8PjIyMnC5XAwfPhyAoqIiRowYAcCvf/1rBgwYwMMPP5x4Smbs2LGceeaZLFy4kIsvvjhx/Ouuu45Zs2btN4ahQ4fyf//3f4mfP/3008STccFgkEceeYSLL76Y73//+0D8fQqFQjz99NOJ17hcrsR7cTDmzp3Lyy+/zIIFC5gzZ06nQ8P/+9//smHDBv75z38yduxYAGbOnMmll17Kfffdx8KFCxkxYgQul4v09PRE/VOmTGHJkiVcddVVACxZsoSRI0
fy0UcfJY793nvvceKJJ6KqKgsWLMDlcvHXv/6VQCAAxK/lefPm8atf/Yp//etfidedccYZnH/++Ymfi4qKABg+fHi
"text/plain": [
"<Figure size 1600x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = pd.read_csv(\"..//static//csv//Forbes Billionaires.csv\")\n",
"\n",
"# Drop the columns that are not used for clustering or plotting\n",
"# ('Rank ' carries a trailing space in the CSV header).\n",
"columns_to_drop = ['Rank ', 'Name']\n",
"df_cleaned = df.drop(columns=columns_to_drop)\n",
"\n",
"# One-hot encode the categorical columns.\n",
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
"\n",
"# Cluster on the two numeric features only.\n",
"features_used = ['Networth', 'Age']\n",
"data_to_scale = df_encoded[features_used]\n",
"\n",
"# Standardise the features before KMeans.\n",
"scaler = StandardScaler()\n",
"data_scaled = scaler.fit_transform(data_to_scale)\n",
"\n",
"# Fit KMeans with a fixed seed so the labels are reproducible.\n",
"random_state = 42\n",
"kmeans = KMeans(n_clusters=4, random_state=random_state)\n",
"labels = kmeans.fit_predict(data_scaled)\n",
"centers = kmeans.cluster_centers_\n",
"\n",
"# Map the centroids back from standardised units to the original Networth/Age units.\n",
"centers_original = scaler.inverse_transform(centers)\n",
"print(\"Центры кластеров:\\n\", centers_original)\n",
"\n",
"fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
"networth_age_ax, *dummy_axes = axes.flat\n",
"\n",
"# Panel 1: the two clustering features with the centroids overlaid.\n",
"sns.scatterplot(x=df_cleaned['Networth'], y=df_cleaned['Age'], hue=labels, palette='Set1', alpha=0.6, ax=networth_age_ax)\n",
"networth_age_ax.scatter(centers_original[:, 0], centers_original[:, 1], s=300, c='red', label='Centroids')\n",
"networth_age_ax.set_title('KMeans Clustering: Networth vs Age')\n",
"networth_age_ax.legend()\n",
"\n",
"# Dummy column names may carry stray whitespace taken from the raw category values\n",
"# (e.g. 'Industry_Technology ' with a trailing space), so fall back to a\n",
"# whitespace-insensitive lookup instead of silently drawing an empty panel.\n",
"columns_by_stripped_name = {str(col).strip(): col for col in df_encoded.columns}\n",
"\n",
"# One entry per remaining panel:\n",
"# (x feature, dummy column, title suffix, x label, y label, y label when the dummy is absent).\n",
"dummy_panels = [\n",
"    ('Networth', 'Country_United States', 'Networth vs Country', 'Networth (in billions USD)', 'Country (United States = 1, Others = 0)', 'Country'),\n",
"    ('Age', 'Industry_Technology', 'Age vs Industry', 'Age', 'Industry (Technology = 1, Others = 0)', 'Industry'),\n",
"    ('Networth', 'Source_Amazon', 'Networth vs Source', 'Networth (in billions USD)', 'Source (Amazon = 1, Others = 0)', 'Source'),\n",
"]\n",
"for ax, (x_col, dummy_name, title, x_label, y_label, fallback_y_label) in zip(dummy_axes, dummy_panels):\n",
"    # Prefer an exact match; otherwise accept a column that differs only by surrounding whitespace.\n",
"    dummy_col = dummy_name if dummy_name in df_encoded.columns else columns_by_stripped_name.get(dummy_name)\n",
"    if dummy_col is None:\n",
"        # Keep an empty, labelled panel rather than raising when the dummy column is missing.\n",
"        ax.set(title=f'KMeans Clustering: {title} (No Data)', xlabel=x_label, ylabel=fallback_y_label)\n",
"        continue\n",
"    sns.scatterplot(x=df_cleaned[x_col], y=df_encoded[dummy_col], hue=labels, palette='Set1', alpha=0.6, ax=ax)\n",
"    ax.set(title=f'KMeans Clustering: {title}', xlabel=x_label, ylabel=y_label)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### PCA для визуализации сокращенной размерности"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABi8AAAJHCAYAAADoqsXxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd5xU1f3/8fedO3V7LywsHaR3EBUV7D22b0zssaCYmGiMJdYkvyQmsddoojFREjFR0dhL7CKKAiq9921s35mdnXJ/fxAmLLsLC8xO2X09H488DPfM3vueOTswZz73nGNYlmUJAAAAAAAAAAAgQdjiHQAAAAAAAAAAAGBXFC8AAAAAAAAAAEBCoXgBAAAAAAAAAAASCsULAAAAAAAAAACQUCheAAAAAAAAAACAhELxAgAAAAAAAAAAJBSKFwAAAAAAAAAAIKFQvAAAAAAAAAAAAAmF4gUAdAOWZcU7AjrQk/umJz93AAAAtMZnw56BfgYQTRQvgG7s/PPP19ChQ1v9b+TIkTryyCP1i1/8QnV1dW1+Zt26dbrjjjt09NFHa/To0TryyCN17bXXavny5R1e595779XQoUP1q1/9qiufTocefPBBDR06NC7Xbs8LL7ygoUOHavPmzV3+cy0tLfrNb36jf//73/sac5+cc845Gjp0qN58880uvU6i9eWBqK+v1/XXX68FCxZEjp1//vk6//zzY5ahs+/nGTNm6MYbb4zqtVetWqXvfe97UTnX5s2bNXToUL3wwgtROR8AAEgcjFniozuNWYYOHaoHH3ywzfGVK1dq6tSpOuKII7R+/frIY4cOHap77rmn3XOFw2FNmzYtaT97lpeX6/e//72OP/54jRkzRocddpiuuOKKVmMSqWvGJWVlZbr88su1ZcuWqJyvo34F0LNQvAC6ueHDh2vOnDmR//3lL3/RRRddpOeff14zZ85sdVfEW2+9pdNPP11LlizRlVdeqT/96U+65pprtH79ev3f//2fPvnkkzbnD4fDmjt3roYMGaKXXnpJPp8vlk+vx6uoqNBf//pXBYPBLrvG2rVrtXDhQg0ZMkTPPvtsl12nu1m2bJleeuklhcPhyLHbb79dt99+e0yuvz/v52h64403tHDhwqicq6CgQHPmzNGRRx4ZlfMBAIDEwpile4vFmGV3q1at0kUXXSSPx6NnnnlG/fr1i7TZbDa98cYb7f7cF198oYqKihiljK4vv/xSp512mt577z1dcMEF+uMf/6ibb75Zzc3NOv/88zV37twuvf6nn36qDz74IGrnmzNnjs4+++yonQ9AcrLHOwCArpWWlqaxY8e2OjZp0iQ1NTXpgQce0OLFizV27Fht3LhRN9xwg6ZNm6b77rtPpmlGHn/sscfqe9/7nm644Qb95z//kdPpjLR9/PHHKisr0z333KPzzjtPr7zyCh8wupkXXnhBJSUlmjlzpq677jpt2LBBffv2jXespDRo0KCYXGd/38+Jyul0tvl7DAAAdB+MWRBNa9as0YUXXqjU1FT99a9/Va9evVq1jx8/XgsWLNDSpUs1fPjwVm2vvvqqhg0bpmXLlsUy8gGrra3VT37yE/Xr109/+ctf5PF4Im3HHXecLr/8ct1222067LDDlJeXF8ekncfnfwASMy+AHmvkyJGSpK1bt0qSnn76abW0tOiWW25pNQiQJI/HoxtuuEFnnnlmm2nbzz//vIYMGaIJEyZoypQpmjNnzl6vPWPGDP3mN7/RhRdeqNGjR+vmm2+WtOMD12233aZDDjlEo0aN0v/93/9p3rx5rX7W7/frt7/9rQ499FCNGzdON910k/x+f6vHtDcFdv78+Ro6dKjmz58fObZ27Vr98Ic/1OTJkzVp0iTNnDlTa9asaXWt3//+9zriiCM0cuRInXLKKXrttddanTccDuuRRx7RkUceqTFjxmjWrFntTm3fXWd/7p133tH3v/99jRs3TiNHjtTxxx+v2bNnS9
qxlM5RRx0lSbrppps0Y8aMyM/985//1BlnnKGxY8dq9OjROu200/T666+3OvfQoUP3ulRQKBTS3LlzNX36dB199NFKSUlpt48DgYDuuusuHX744Ro9erQuueQSzZ07t82U8hdffFEnnniiRo0apVNPPVXz5s3T8OHD9zgl+7XXXtMZZ5yhcePG6dBDD9Vtt93W6rV68MEHdfzxx+vtt9/WySefrFGjRum0007TwoULtWjRIp199tkaPXq0Tj755Da/TytXrtTMmTM1fvx4jR8/XldddZU2bdoUad/5e/Pss89q+vTpGj9+fORuvj29xvPnz9cFF1wgSbrgggsiv4+7/m7+4Ac/0BlnnNHm+c6aNUunnnpq5M8LFizQeeedpzFjxmjy5Mm64YYbVF1d3eHrJe3/+3nX57zre2X37JL07bff6sILL9SECRM0btw4XXTRRVq0aJGkHX3y0EMPSWo93TscDuvxxx/XMccco5EjR+q4447T008/3eY61113na6++mqNHTtWF198cZtlo1544QUNHz5cixcv1ne/+12NGjVK06dP1xNPPNHqXBUVFbrmmmsi7/HbbrtN9957b6v3CgAASFyMWRizdGbMsqs1a9boggsuUHp6up555pk2hQtpR2EsLy+vzeyLYDCot956SyeddFKbn+lMv1dXV+sXv/iFpk+frpEjR2ry5Mm66qqrWo2Hzj//fN188816/PHHdeSRR2rUqFE655xz9PXXX0ce09zcrDvuuEOHH3545PXc/XPu7ubOnauKigr9/Oc/b1W4kHbMNLnuuut07rnnqrGxsc3PdrRE64033tiqvzZu3KgrrrhCU6ZM0ZgxY/Td7343MtPihRde0E033SRJOuqoo1r12T//+U+ddNJJkeXgHnzwQYVCoVbXufDCC3X77bdr/PjxOvHEExUKhVqNI3a+N+bNm6cf/OAHGjNmjA499FD94Q9/aHWuxsZG3XbbbZo6darGjRuna665Rk899VRCLdkGYN9QvAB6qHXr1kmS+vTpI0n66KOPNHz4cBUWFrb7+KlTp+qaa65Rfn5+5Fhtba3+85//6Dvf+Y4k6fTTT9c333yjJUuW7PX6s2fP1qhRo/TII4/orLPOkt/v14UXXqh3331X11xzjR566CEVFRXp0ksvbfWh8Gc/+5mee+45zZw5U/fdd5/q6ur01FNP7fPzLy8v13e/+12tX79ed9xxh/7whz+oqqpKF154oWpra2VZlq666io9++yzuvjii/Xoo49GPvzsOt32D3/4gx5++GGdddZZeuihh5SVlaW77757r9fvzM+9//77uuqqqzRixAg98sgjevDBB9WnTx/98pe/1OLFi1VQUBD5gvjKK6+M/P/Zs2frtttu09FHH63HHntMd911l5xOp6677jqVlZVFzj9nzhzNmjVrjzk//PBDVVZW6jvf+Y7cbrdOOOEEvfjii2ppaWn1uNtuu01//etfdd555+nhhx9WXl6ebr311laPmTt3rm688UaNHz9ejzzyiI477jjNmjWr1YfN3T3yyCO69tprNXbsWD3wwAO66qqr9Oabb+r8889Xc3Nz5HFlZWW68847dcUVV+j+++9XfX29rr76al177bU6++yz9fDDD8uyLF1zzTWRn1u3bp3OOeccbd++Xb/73e/061//Wps2bdL3vvc9bd++vVWOhx56SDfccINuu+02jRs3bq+v8YgRI3TbbbdFXpv2loo69dRTtWTJEm3YsCFyrL6+Xh9++KFOO+00STumrV900UVyu92677779POf/1yff/65LrjgglbPf3f7837eF42Njbr00kuVnZ2tBx98UPfee698Pp8uueQSNTQ06Oyzz9ZZZ50lqfV07zvuuEMPPPCATj31VP3xj3/U8ccfr9/85jd6+OGHW53/9ddfV2pqqh599FFdeuml7WYIh8P6yU9+ohNPPFGPP/64xo8fr9///vf66KOPJO1YW/nCCy/UV199pZ///O
f67W9/q+XLl+vJJ5/cr+cMAABijzELY5bOjFl2Wrt2rS688EKlpaXpmWee6fD3xDRNHXfccW2KF/PmzZPf729zo0t
"text/plain": [
"<Figure size 1600x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Project the standardised features onto their first two principal components.\n",
"# NOTE(review): data_scaled appears to hold only two features (Networth, Age), in which\n",
"# case PCA rotates the data rather than reducing its dimensionality - confirm.\n",
"pca = PCA(n_components=2)\n",
"reduced_data = pca.fit_transform(data_scaled)\n",
"\n",
"# Same projection drawn twice, coloured by each clustering in turn.\n",
"# `result` is presumably the agglomerative labels computed in an earlier cell - confirm.\n",
"plt.figure(figsize=(16, 6))\n",
"for panel_index, (cluster_ids, panel_title) in enumerate(\n",
"    [\n",
"        (result, 'PCA reduced data: Agglomerative Clustering'),\n",
"        (labels, 'PCA reduced data: KMeans Clustering'),\n",
"    ],\n",
"    start=1,\n",
"):\n",
"    plt.subplot(1, 2, panel_index)\n",
"    sns.scatterplot(x=reduced_data[:, 0], y=reduced_data[:, 1], hue=cluster_ids, palette='Set1', alpha=0.6)\n",
"    plt.title(panel_title)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Анализ инерции для метода локтя (метод оценки суммы квадратов расстояний)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA14AAAImCAYAAABD3lvqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB04UlEQVR4nO3dd3xUVf7/8fdMeocUUmgJBEJJQg0I0kUERJeiYgHLV2yLy0/RRVlZd1FRV0EEEVcWGwoCCmJBKSIqIiBVWugEAySEJKRAejK/P0JGhoQkhCQ35fV8PPIIuffcM58Zzq55c+4512SxWCwCAAAAAFQZs9EFAAAAAEBdR/ACAAAAgCpG8AIAAACAKkbwAgAAAIAqRvACAAAAgCpG8AIAAACAKkbwAgAAAIAqRvACAAAAgCpG8AIAAACAKkbwAgAAAIAqRvACgEowduxYhYWF6c4777ximyeffFJhYWF69tlnq7EyABV18uRJhYWFafny5UaXAqAOIHgBQCUxm83atWuX4uPji53LyMjQ+vXrDagKAADUBAQvAKgk7dq1k5OTk1atWlXs3Pr16+Xi4iJ/f38DKgMAAEYjeAFAJXF1dVXfvn1LDF7ffvutbrrpJtnb2xc79/3332vkyJGKiIjQ9ddfr5deekkZGRmSpAEDBigsLKzEr5MnT0qSNm7cqLvvvltdunRR9+7d9dRTTykuLs7mNZ566qkS+yjrFqqiWyhL+rrUnj179OCDD6p79+7q3LmzHn30UR0+fNh6fsuWLQoLC9OWLVskSYcOHdLAgQN155136q233rria7z11luSpM8++0xDhgxReHi4zfmybttcunRpif1eel3R7WRltatoDeX9bEp7/SudL/p7ePbZZzVgwACb1128eLHNZ3jp62zfvt2m7SeffKKwsDCbPrKysjRjxgwNGjRI4eHh6ty5sx544AFFR0fbXHulusaOHWvTpqiOklw+PoqMHTvWpp/s7Gy9/fbbGjx4sCIiIjRo0CDNmzdPBQUFNtdcXsuWLVvKdW1ZLBaLJk+erMjISP3yyy/lvg4AJKn4bwAAgAobOnSonnjiCcXHxysgIECSdP78ef3888/64IMP9PPPP9u0//rrr/X000/rlltu0RNPPKFTp05p5syZOnLkiD744APNmTNHOTk5Onv2rB5//HE99thj6tevnySpUaNGWrFihZ555hkNGzZMjzzyiM6dO6fZs2dr9OjR+uKLL+Tj4yOp8BfW0aNHa+TIkZJk7a882rVrp3/961/Wnz/77DN9/vnn1p83b96scePGqXv37nr55ZeVnZ2td999V3feeaeWLl2qli1bFuvz9ddfV3h4uB577DF5eXmpd+/ekqSpU6dKkvX1AgICtHXrVk2ZMkW33XabpkyZIjc3N0kqV/1ZWVmKiIjQlClTrMeudN2ln+3l7Spaw9V8Ns8//7zat29f4usvWbJEkrRv3z698MILxdpeLjU1VW+++WaJ59zc3PTDDz+oS5cu1mPffvutzGbbf4udNGmStm3bpokTJ6pZs2Y6ceKEZs2apaeeekorV66UyWSytr3tttt0++23W38u+nusTBaLRY8++qh27dqlxx9/XG3atNGWLVv05ptvKjY2Vi+++KK17eVjtmXLluW+tjQvvfSSvvnmG7399tvq1atXpb9HAHUbwQsAKlG/fv3k4uKiVatW6f7775ckrV27Vj4+Pja/6EqFv0hOnz5dvXv31vTp063Hg4ODdf/99+unn36yBoGi2a1mzZqpY8eOkqSCggJNnz5dvXr10owZM6zXd+7cWUOHDtV7772nSZMmSZIyMzMVHBxsvbaov/Jwd3e3XidJGzZssDk/Y8YMNW/eXPPmzZOdnZ0kqVevXrrxxhs1e/ZszZo1y6b9iRMn9Msvv+irr75Sq1atJMkaUt3d3SXJ5vVWrlwpSfrHP/5hDTyS5OjoWGbtmZmZ8vX1tenvStdd+tle3m737t
0VquFqPpvQ0NArvn7R8ezs7BLbXm727NkKCgrSuXPnip3r06eP1q1bp7///e+SpPj4eO3cuVNdu3bVqVOnJEk5OTm6cOGCpkyZoqFDh0qSunXrpvPnz+vVV19VYmKi/Pz8rH0GBATY1FP091iZfv75Z/3666964403dPPNN0uSrr/+ejk7O2vWrFm69957rePp8jH7008/lfvaK5kxY4aWLFmiOXPmqE+fPpX+/gDUfdxqCACVyNnZWQMGDLC53XDlypUaMmSIzQyBJB07dkzx8fEaMGCA8vLyrF9RUVFyd3fXxo0bS32t48eP6+zZsxo2bJjN8WbNmqlTp0767bffrMfi4uLk4eFRCe/QVkZGhvbs2aMhQ4ZYg4UkeXp6qn///jY1FLWfOXOmunfvXuYvukUiIyMlSe+//74SEhKUk5OjvLy8cl1bWe+7IjVc7WdTWQ4dOqQlS5bon//8Z4nnBwwYoJiYGB07dkyStGrVKnXo0EGNGze2tnF0dNR7772noUOH6syZM9q8ebMWL15s3SAmJyfnqusqKChQXl6eLBZLmW2Kvi5t+9tvv8ne3l6DBw+2uebWW2+1nr+Sa7lWkhYuXKh58+bp5ptvtpkVBYCrwYwXAFSyIUOG6PHHH1d8fLycnJy0adMmPfHEE8XapaSkSCq8LaukW7MSEhJKfZ2i6319fYud8/X11f79+yUVzqydPn1aTZo0ubo3Ug7p6emyWCxXrCE9Pd3m2KOPPipPT0+bWxXLEhUVpSlTpmjevHmaM2fOVdV36tSpUm/Jq8oarvazqSwvvfSSbr75ZnXq1KnE8/7+/goPD9e6devUokULffvttxo2bJh1vBTZsGGDXn75ZR07dkxubm5q06aNXF1dJanU8HQlc+fO1dy5c2VnZydfX1/16tVL/+///T+bDWeKZokv1a1bN0mFt082bNjQJsRKss68lfZ5Xsu1knTgwAH16tVL33zzje677z61a9eu1PYAUBKCFwBUsj59+sjNzU2rVq2Sq6urmjRpovDw8GLtPD09JRWupSn65fJSXl5epb5OgwYNJEmJiYnFzp09e1YNGzaUJEVHRysrK6vYhhiVwcPDQyaT6Yo1FNVYZNKkSVq1apUmTJighQsXlvuWtDvuuEO//PKL8vLy9Pzzz6tJkyZ67LHHSr2moKBAv//+u0aNGlWu17h8RvJaa7jaz6YyfPfdd9q7d6/NraclueGGG7Ru3ToNGTJEe/fu1Zw5c2yC1x9//KHx48dr4MCBevfdd9W0aVOZTCYtXLiw2K2mUtmfnVT4+d1xxx0qKCjQ6dOnNXPmTD300EP66quvrG2mTp1qE5QvXafl5eWlc+fOKT8/3yZAFf0DRdF4L8m1XCtJ/+///T/de++9uvnmmzVlyhR99tlnxUIcAJSFWw0BoJI5Ojpq4MCBWr16tb777jvrmpLLtWjRQj4+Pjp58qQiIiKsX/7+/poxY0axGYjLhYSEyM/PT998843N8djYWO3atUudO3eWJP34449q27atvL29r/q9FBQUlPoLpqurq8LDw/Xdd98pPz/fejw9PV0//vhjsXVt4eHhmjNnjk6dOqXXX3+93HXMmjVLP/74o1599VUNGTJEERERZa6v2rFjhzIyMtS9e/dS2xXN3ly+ucS11nC1n821ysnJ0Wuvvabx48fbrL8qycCBA/X777/rk08+UZcuXdSoUSOb83v37lV2drYefvhhNWvWzBqsikJX0WdWtCNgWZ+dVLgZTEREhDp06KAhQ4bonnvu0cGDB5WammptExISYvO/hUvX03Xr1k15eXnFdg0tCm6lfZ7Xcq1UOEPp7Oys559/Xvv27dMHH3xQ5vsFgMsx4wUAVWDo0KF65JFHZDabbXbUu5SdnZ2efPJJPf/887Kzs1P//v2VlpamuXPn6syZM2XeImc2mzVx4kRNnjxZTz31lG699VadO3dOc+bMkZeXlx544AHt27dPCxcu1M0336xdu3ZZrz
179qykwpmN5OTkYqEsOTlZR44c0YkTJ6wB7kqeeuopPfjgg3r44Yd19913Kzc3V/PmzVNOTo7Gjx9frL2/v7+eeOI
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Elbow method: fit KMeans for k = 1..10 and record the inertia of every fit.\n",
"clusters_range = range(1, 11)\n",
"inertias = []\n",
"for i in clusters_range:\n",
"    # fit() returns the estimator itself, so the fitted model can be bound directly.\n",
"    kmeans = KMeans(n_clusters=i, random_state=random_state).fit(data_scaled)\n",
"    inertias.append(kmeans.inertia_)\n",
"\n",
"# Inertia against k; the bend of the curve is read off as the candidate number of clusters.\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(clusters_range, inertias, marker='o')\n",
"plt.grid(True)\n",
"plt.xlabel('Количество кластеров')\n",
"plt.ylabel('Инерция')\n",
"plt.title('Метод локтя для оптимального k')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Расчет коэффициентов силуэта"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1oAAAImCAYAAABKNfuQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC2wklEQVR4nOzdeVTVdf7H8eflsm+yiCJuuAFugOa+pKlZqW22j2abluWMzZTUNDWtNjVpNVljamplZTVpmaWV2Z57mqDihooLgmwKssO99/cHwS/CBfBevhd4Pc7xqN/tvu5HVN73s5lsNpsNERERERERsRsXowOIiIiIiIg0Niq0RERERERE7EyFloiIiIiIiJ2p0BIREREREbEzFVoiIiIiIiJ2pkJLRERERETEzlRoiYiIiIiI2JkKLRERERERETtToSUiIiIiImJnKrREpMm49dZbufXWW6sc++WXX7jqqqvo1q0bH330kUNf/+9//zsjRoyo9X0jRozg73//uwMSiYijREZG8uqrrxodQ0QM5Gp0ABERo2RlZTF16lS6d+/OokWLiIyMNDqSiIiINBIqtESkyXrzzTcpKirihRdeoGXLlkbHERERkUZEQwdFpEk6efIkS5cu5corr6xWZCUnJzN9+nQGDx5MbGwst956K1u3bq1yzffff8/48eOJiYlh0KBBPPHEE5w+fbrKNe+99x6XXHIJMTEx/O1vfyMvLw+A119/nYEDB9KnTx+eeOIJSkpKKu8pKSnhqaeeom/fvvTv379y6FF+fj5xcXHExsYybNgw3nvvvcp7jh07RmRkJB9//HHlseLiYkaOHFmll+5MQyc3bdpEZGQkmzZtOuPvobznr0+fPtWGPX700UeMHTuWHj16MHz4cF599VUsFkvl+TMNlfx91orXOtOPipznGzZ5pvf0R+np6Tz88MMMHDiQXr16MXHiRH799dfK838c4mWz2bj55puJjIzk2LFjVa47V9bp06dz8cUXY7Vaq7z+o48+ymWXXQZAWloaDzzwAAMGDCAmJoZbb72V7du3A/Dqq6+e9TUq8u3Zs4c///nPDBgwgO7duzN06FBmzpxJUVHROdtg3bp158xe0/cI8M0333DttdcSExNzzmf93scff0xkZCTx8fFce+21REdHc+WVV/Lll19Wue7YsWM89NBDDBkyhO7duzNw4EAeeughTp48WXnN7t27mTBhAr169WLUqFF88MEHlefO9PUL1b9Ozjes7/dfd0uWLKn292vjxo1ERUXx3//+96zP+KM5c+bQtWtXPvnkkxrfIyINm3q0RKRJsdlspKamMnPmTMrKyrjnnnuqnE9KSuLGG28kPDycxx57DDc3N5YsWcJtt93G4sWL6devH1u2bOHee+/lqquu4sEHH2T//v385z//Yd++fbz77ruYzWbWrl3L008/za233srFF1/Mhx9+yNq1awFYvXo1M2fOJCUlhdmzZ+Pp6ckjjzwCwKxZs1i+fDkPPfQQoaGhvPzyy6SkpJCSksLll1/OnDlz+PHHH3n66acJDQ1l5MiRZ3yfCxcurFIkXIgXX3yR06dP4+/vX3ls/vz5vPzyy0ycOJFHHnmE3bt38+qrr5Kamsq//vWvGj23e/fufPjhh0B50bZs2bLK3/v6+tole35+PrfccgsWi4W4uDhatmzJ4sWLufPOO/nkk08IDw+vds+nn35apRD7veuvv54bbrih8vdPPfVUlXNfffUVmzZtYuDAgQAUFRXx5ZdfMmXKFEpKSpg8eTKlpaU88cQTuLm5MXfuXG699Vb+97//ccMNNzB06NAqz33iiScACA0NJT09nQkTJhAbG8vzzz+Pu7s7P/74I2+++SYtWrTg7rvvPms7FBUVERoayiuvvHLG7DV9j0eOHOH+++9n6NCh/O1vf6v8mjjbs/7onnvuYeLEifztb39j2bJl/PWvf2X+/PkMGzaMwsJCJk2aRGBgIE888QR+fn
78+uuvvPbaa3h6evL0009TWFjIlClTaN26Na+++irbtm3jiSeeICwsjIsvvrhGGWrr1ltvZc2aNfz73/9m+PDhuLu7849//IPY2FimTp1ao2csWrSIuXPnMnPmTK699lqH5BQR56NCS0SalC1btjB8+HDc3Nx44403qn2j/dprr+Hu7s6SJUsqv9kfPnw448aN44UXXmDZsmWsWLGC8PBwnnvuOVxcXBg8eDBeXl48/vjj/PDDD4wYMYJ58+bRv39/HnvsMQD69+/P4MGDOX36NM899xw9evQAIDc3lzfeeIP77rsPq9XKhx9+yN13383EiRMBaN68OTfddBMBAQHMnj0bNzc3Lr74Yvbt28f8+fPPWGilpqbyxhtv0L17d3bt2nVB7bVjxw4+/fRTunbtSm5uLgCnT59m7ty53HTTTZXvb8iQIQQEBPDYY49xxx130KVLl/M+29fXl9jYWAB++ukngMrf28snn3xCSkoKn3zyCV27dgWgd+/eXHPNNWzZsqXan39+fj6zZ88+a9uFhoZWyfj7gnDIkCGEhoayYsWKykLr66+/pqCggGuuuYbt27dz8OBB3nvvPXr16lWZ5dJLL2Xu3Lm8+uqrhIaGVnnu71/r559/pmvXrrzyyiuV5wcNGsS6devYtGnTOQutwsJC/P39z5q9pu8xMTGR0tJS/va3vxEREXHeZ/3RrbfeyrRp0wAYOnQo1157Lf/9738ZNmwYycnJhIaG8u9//5u2bdsCMGDAAOLj49m8eTMAKSkp9OzZk3/84x+0bduWIUOGsHTpUn766SeHFVomk4nnnnuOq666ilmzZmE2mzl16hRvv/02ZrP5vPe///77zJo1i6effprrr7/eIRlFxDlp6KCINCndunXj+eefp1mzZjzyyCPVen02b97MJZdcUuUbR1dXV8aOHcvOnTvJz8/n2WefZcWKFbi4uFBWVkZZWRmXXXYZLi4ubNmyhbKyMhITExkyZEjlMzw8PIiJicHLy6uyyILyb86LiorYu3cve/fupbi4uLJXA8q/0fbw8CA6Oho3N7cq9+3atavKUL0K//73v+nTpw+XXHLJBbWVzWZj5syZXH/99URFRVUe//XXXykqKmLEiBGV77+srKxymOC6deuqPOf31/xxWF1Nc9T13q1bt9KmTZvKIgvAy8uLr776qkqvTYW5c+cSGBjILbfcUuvXcnFx4dprr2XNmjUUFhYC5YXeoEGDCA0NpV+/fmzfvp3Y2FgsFgtlZWX4+/szePBgtmzZct7nDxkyhHfffRcPDw+SkpL45ptveP3118nOzq4y/PRMUlNT8fPzq/V7+qPu3bvj6urKu+++S0pKCiUlJZSVlWGz2Wp0/+97c0wmE5deeikJCQkUFRXRtWtXli5dSuvWrUlOTuaHH35g0aJFHDx4sPL9de7cmddff522bdtSUlLCjz/+SE5ODp06daryOlartcrX3ZnyVVxTk+xt27ZlxowZfPLJJ3z00Uc89thjlcXguXz33Xc89dRT9OnThxtvvPG814tI46IeLRFpUnx9fbn22mvp2LEjt9xyC3/961/58MMPKz+ZzsnJoXnz5tXua968OTabjby8PHx8fPDw8ADKv/H8vdzcXLKysrBYLAQGBlY5FxAQQLNmzaocqxh6lZmZWVk0/fG+Zs2aERAQUO2+srKyKnNXoLxQXLt2LStXrmTVqlU1aZKzWrFiBcnJycybN49///vflcdPnToFcNYelPT09Mpfp6SkVGujuuRYsWIFJpOJ4OBgLrroIu6///5q31yfyalTpwgODq7R6yQnJ/P222+zcOFCjh8/Xqes1113HfPmzWPNmjUMGDCADRs2MHv27Mrz7u7uQPm8rd/P1alJz4jVauWll17ivffeo6CggFatWhEdHV35tXguKSkptG7dug7vqKq2bdsya9YsXnrppcphnhX69et33vtbtGhR5ffBwcHYbDZyc3Px9PTkzTffZN68eZw6dYrmzZ
vTo0cPvLy8qs1/zM3NpW/fvgCEhIRwxRVXVDl/++23V3vtP+abO3cuc+fOxWw207x5c4YMGcL9999/1oVxxowZw/P
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# The silhouette coefficient needs at least two clusters, so k = 1 is skipped.\n",
"silhouette_k_values = clusters_range[1:]\n",
"silhouette_scores = []\n",
"for i in silhouette_k_values:\n",
"    kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
"    # Dedicated name: reusing `labels` here would overwrite the 4-cluster KMeans labels\n",
"    # that the PCA comparison cell reads, leaving k=10 labels behind after the loop.\n",
"    candidate_labels = kmeans.fit_predict(data_scaled)\n",
"    silhouette_scores.append(silhouette_score(data_scaled, candidate_labels))\n",
"\n",
"# Silhouette coefficient against k (higher means better separated clusters).\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(silhouette_k_values, silhouette_scores, marker='o')\n",
"plt.title('Коэффициенты силуэта для разных k')\n",
"plt.xlabel('Количество кластеров')\n",
"plt.ylabel('Коэффициент силуэта')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средний коэффициент силуэта: 0.478\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA00AAAJzCAYAAADTBPhFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3xUVf7G8c+dmt5JQguh11Ckq6iAuq51Qf1ZsRcUu66rq7uuurquixVQLKuorCv2tWDvld6kSQudkBDSM5l2f38ERoYU0icZnrcvXpJ7Zu79zr1JmGfOuecYpmmaiIiIiIiISLUsoS5ARERERESkNVNoEhERERERqYVCk4iIiIiISC0UmkRERERERGqh0CQiIiIiIlILhSYREREREZFaKDSJiIiIiIjUQqFJRERERESkFgpNIiIiIiIitVBoEpEGmzRpEr179w76M2zYMC666CLmz58f6vJEJMz17t2badOmVdn+66+/Mnr0aI499liys7NrfP60adPo3bs3WVlZlJSUVPuY//73v/Tu3Ztx48Y1Vdki0gYpNIlIo/Tr1485c+YwZ84cXn31VR566CHsdjuXX34569atC3V5InKYWbduHZdccgmRkZHMnj2bzMzMQz7H6/Xy5ZdfVts2d+7cJq5QRNoihSYRaZSYmBgGDx7M4MGDGTp0KMcffzzTpk3DYrHw9ttvh7o8ETmMbNiwgYsvvpjo6Ghmz55N586d6/S8I444go8++qjK9pycHBYuXEjfvn2bulQRaWMUmkSkyUVGRuJ0OjEMI7Bt0qRJTJo0KehxjzzyCL179w4KV7Nnz2b8+PEMGTKECy+8kF9//RWA//znP/Tu3ZtNmzYF7eN///sfffv2ZefOnQB8/vnnnH/++QwZMoQBAwZw0kkn8Z///CfoOXfccUeVYYX7/2zbti3wmIOH47z22mtVhgPNnTuXk08+mcGDBzNx4kQWLlwY9JxD1TNv3jx69+7NvHnzgp538Pmqy/lzu93885//5Nhjj6Vv375Br6u2AHvwvh944AGysrL49ttvgd+GMFX358C663Lud+/ezZ/+9CdGjx4duMZLliwBYNy4cYe8LgsXLuTCCy9k0KBBjBgxgj/96U/k5+cH9v/222/Tu3dvli1bxoQJExg4cCCnnXYaH3/8cVAdxcXF/OMf/+D4448nKyuLU089lTfffDPoMQfW06dPH4YPH87111/P3r17azyXABs3buS6665jxIgRDB8+nKuvvpoNGzbU+Pjazu+B1y07O5sbbriBo446isGDBzNp0iQWLVoUaN+2bVvgee+9917QMb766qtA24Hmzp3LxIkTGTJkCEcddRR//etfKSwsrFLbgar7Xhw3bhx33HFHjV8fbH+tB76+xYsXc84555CVlcVRRx3F/fffj8vlqnEfB9uwYQMXXXQRsbGxzJ49mw4dOtT5uSeffDLff/99lSF6H3/8MV27dqVPnz5VnvP5558zceLEQL1///vfKSsrq/KYuvz8//TTT1x22WUMGjSIo446in/961/4fL7A43744Qf+7//+jyFDhjB8+HCuueaaWr+nRKTpKTSJSKOYponX68Xr9eLxeMjNzeWRRx7B7XZz5pln1vi8LVu2MGvWrKBtn376Kffffz+nnHIKM2bMwOfzMXnyZNxuN6eddhpOp5P//e9/Qc959913GT16NO3bt+frr79mypQp9O/fn6eeeopp06bRuXNn7rvvPpYtWxb0vHbt2gWGFc6ZM4drrrmm1tdZWFjI448/HrRt+fLl3HbbbQwePJinn36a9u3bM3nyZPLy8gDqVU99VXf+nnvuOV566SUuvvhiXnrpJebMmcP06dPrtd/ly5fz3//+l8cff5whQ4YEtR14vv76178GtdXltZaWlnLeeecxb948/vjHPzJ9+nScTieXXXYZ2dnZTJ8+Pajma665JnC81NRUFixYwCWXXEJERASPP/44f/7zn5k/fz
4XXXRRlTfXV199NePHj2f69Ol07dqVm266iW+++QYAl8vF+eefz/vvv88VV1zBU089xdChQ7nrrruYOXNm0H6OPfZY5syZwyuvvMKtt97KDz/8wAMPPFDj+cvJyeGcc84hOzubv/3tb/zrX/8iLy+Piy++mIKCglrP/YHn9+Drtn79eiZOnMi2bdu4++67mTp1KoZhcPHFF1e5fzA6OrrKULO5c+disQT/k//UU09xyy23MHjwYJ588kmmTJnCJ598wqRJk+oVVprCzp07ufzyy0lMTGT69OnccMMN/O9//+P222+v0/M3btzIxRdfTExMDLNnzyYtLa1ex//d736Hz+er9rydcsopVR7//vvvM2XKFLp168aMGTO47rrreO+997j22msxTROo38//bbfdxtChQ5k5cyannnoqzz//PG+88QYAW7du5dprr2XAgAE8/fTTPPDAA2zatImrrroKv99fr9cpIg1nC3UBItK2LViwgP79+1fZfsstt9C9e/can/fggw/Ss2dPVq5cGdiWn5/P+eefzy233AJU9pzs/5S+b9++nHDCCbz33nvceOONGIbBrl27+Pnnn/nXv/4FVL6xnDBhAnfddVdgn0OGDGHkyJHMmzePQYMGBbY7HA4GDx4c+Hrjxo21vs4nn3ySDh06BPUy7Nq1i9/97nf8/e9/x2KxkJKSwqmnnsrSpUs5/vjj61VPfVV3/pYvX06fPn247LLLAtv299DU1f6evvHjx1dpO/B8VVRUBLXV5bW+8847bN++nXfeeScw3OmII47gD3/4AwsWLODss88OqjkjIyPomI888ghdu3blmWeewWq1AjBo0CBOOeUU3nrrLS644ILAYydNmsSUKVMAGDNmDBMmTGDGjBkce+yxvP322/z666+89tprgWA4ZswYvF4vTz31FOeeey4JCQkAJCUlBWoYPnw4P/74Y9A5P9isWbNwu928+OKLtGvXDoA+ffpw3nnnsWzZMo499tgan3vgaz34uk2fPh2Hw8HLL79MTEwMAMcddxynnnoqDz/8cFAv2THHHMN3332H2+3G4XBQUVHBF198wfDhwwM9g4WFhTz99NP83//9X1AA7tWrFxdccEGV89ncnnvuORITE5kxY0bg2losFu6++27Wrl1bpbfrQNnZ2Vx00UXk5eXh8XgaFCRSUlIYPnw4H330EaeffjoA27dvZ9myZTz88MM8/fTTgceapsnUqVMZM2YMU6dODWzPzMzkkksu4ZtvvuG4446r18//2WefHfh+HT16NJ9//jlff/015557LsuXL8flcnH11VcHwmB6ejpffPEFZWVlge8HEWleCk0i0ij9+/fn3nvvBSrfTBQVFfHtt9/y2GOPUVZWxs0331zlOd9++y0//vgjzz33HBdddFFg+7nnnguA3++nrKyMTz/9lIiICDp27AjAWWedxQcffMDChQsZPnw47777LtHR0ZxwwgkAXHHFFUBlj8amTZvYsmULK1asACoDWEP9+uuvgd6G/TUCnHjiiZx44omYpklZWRkfffQRFouFrl27Nms9NZ2/rKwsnn32WT755BNGjRpFdHR0nd9AmqbJkiVLmDt3bpUerLqoy2tdtGgRnTp1Cro/JDIykk8++eSQ+y8vL2fZsmVcfvnlgd5NgM6dO9O9e3d++OGHoDf5EyZMCPzdMAxOOOEEpk2bhsvlYv78+XTs2LFKT9rpp5/Om2++GRRu9h/L7/ezZs0aFi1axJFHHlljnYsWLWLw4MGBwASVb3C/+uqrQ77G2syfP5+xY8cGvUG22WyBXtnS0tLA9lGjRvHtt98yb948xowZw7fffktMTAzDhg0LhKalS5fidrs59dRTg44zbNgwOnbsyPz58xsdmvafO4vFUqWXaz+/34/X62XhwoUcffTRgcAEleEPKs9pbaHpgw8+YMCAATz22GNcdtll/PGPf2TWrFlBx/T5fIEeIKj8njjwWFA5RO/vf/87JSUlxMTE8O
GHH9K/f3+6dOkS9LiNGzeya9curr766sD3IVSG6piYGH744QeOO+64ev38H/y9mJ6eHhjqN2jQIJxOJ2eddRYnnXQ
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# silhouette_score and PCA are already imported in the notebook's first cell,\n",
"# so they are not imported again here.\n",
"\n",
"# ========================\n",
"# Fit K-Means with k = 3\n",
"# ========================\n",
"# Reuse the shared seed defined with the KMeans comparison instead of a second literal 42.\n",
"kmeans = KMeans(n_clusters=3, random_state=random_state)\n",
"df_clusters = kmeans.fit_predict(data_scaled)\n",
"\n",
"# ========================\n",
"# Clustering quality\n",
"# ========================\n",
"# Mean silhouette over all samples, computed on the same standardised features.\n",
"silhouette_avg = silhouette_score(data_scaled, df_clusters)\n",
"print(f'Средний коэффициент силуэта: {silhouette_avg:.3f}')\n",
"\n",
"# ========================\n",
"# Cluster visualisation\n",
"# ========================\n",
"# Project onto two principal components and colour the points by cluster id.\n",
"pca = PCA(n_components=2)\n",
"df_pca = pca.fit_transform(data_scaled)\n",
"\n",
"plt.figure(figsize=(10, 7))\n",
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_clusters, palette='viridis', alpha=0.7)\n",
"plt.title('Визуализация кластеров с помощью K-Means')\n",
"plt.xlabel('Первая компонентa PCA')\n",
"plt.ylabel('Вторая компонентa PCA')\n",
"plt.legend(title='Кластер', loc='upper right')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "Средний коэффициент силуэта, равный 0.478, указывает на умеренно хорошую кластеризацию: кластеры различимы, но их разделение не идеально. \n",
"\n",
"Средний коэффициент силуэта (silhouette score) указывает на качество кластеризации, измеряя, насколько хорошо точки внутри одного кластера близки друг к другу по сравнению с точками из других кластеров. Значения коэффициента силуэта находятся в диапазоне от -1 до 1:\n",
"\n",
    "- 1: указывает на идеально плотные и четко разделенные кластеры.\n",
    "- 0: указывает на перекрытие кластеров или слабую структуру кластеризации.\n",
    "- Отрицательные значения: указывают, что точки в кластере расположены ближе к другому кластеру, чем к своему."
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средний коэффициент силуэта (агломеративная кластеризация): 0.409\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA00AAAJzCAYAAADTBPhFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hTZf8G8Ptkdjfdk26gBVpmGcpG0VcUBQciggiIKCgOHLzqDyeiL7jYQ0VAFBFEEBAFlSV7j7JpoUD3btPM8/ujNjY0LU2bNm16f66LS3ue5OSb86Rp7jzPeY4giqIIIiIiIiIiskhi7wKIiIiIiIgaM4YmIiIiIiKiajA0ERERERERVYOhiYiIiIiIqBoMTURERERERNVgaCIiIiIiIqoGQxMREREREVE1GJqIiIiIiIiqwdBERERERERUDZm9CyCqDyNHjsT+/fvNtrm7u6NNmzaYNGkSunbtaqfKiIioqfrwww9RXFyMl19+GWfPnsXzzz+PPXv2QCqV2rs0IqpnDE3ksNq0aYNp06YBAAwGA3Jzc/Hdd99h7NixWLt2LVq2bGnnComIqCkZPXo0Hn/8cXTv3h1yuRxvvfUWAxNRMyGIoijauwgiWxs5ciQAYPny5WbbS0pK0KNHDzz22GN47bXX7FEaERE1YTqdDleuXIGPjw9UKpW9yyGiBsJzmqhZcXZ2hlKphCAIpm0jR440haxys2bNQuvWrbF27VrTthUrVmDAgAHo2LEjHn/8cZw7dw4A8O2336J169a4fPmy2T5+/vlnxMXF4caNGwCArVu34rHHHkPHjh3Rrl073H333fj222/N7vP666+jdevWFv+lpqaabtO/f3+z+33//fdo3bo1Zs+ebdq2adMm3HPPPejQoQOGDh2KgwcPmt3nVvXs27cPrVu3xr59+8zud/Pxqsnx02q1+Oijj9CnTx/ExcWZPa+Kx/hmN+/7gw8+QHx8PHbs2AEAmD17dpXHq2LdNTn2GRkZeO2119CjRw9THx85cgQA0L9//1v2y8GDB/H444+jffv26Nq1K1577TXk5OSY9r927Vq0bt0ax44dw5AhQ5CQkID77rsPv/76q1kdhYWF+PDDD3HHHXcgPj4e9957L3788Uez21SsJzY2FomJiXjuueeQm5tb5bEEgEuXLpmmpyYmJuLpp5/GxYsXq7x9dce3Yr8lJyfj+eefx+23344OHTpg5MiROHTokKk9NTXVdL/169ebPcaff/5paqto06ZNGDp0KDp27Ijbb78d//d//4f8/PxKtVVk6bXYv39/vP7661X+fLPyWis+v8OHD2PYsGGIj4/H7bffjvfeew+lpaVV7qN8P6+++ip69uyJtm3bokePHnj11VfN+sjS6yo1NbXGr+uMjAxMnToVffr0QUJCAh566CFs27bNrI7y+82fP99s+7lz5yq9hgHbvY6re/4VXw83/yt/b6vJ+0p5LeX/2rVrh7vuusvsNWbpdVJ+XCq+X9b0WM6ePRtyuRzR0dHw9PTEo48+WukYVvdYxcXFGDlyJNq0aQONRmN6rlUdj3IGgwGLFi3Cvffei4SEBHTo0AGPPvoo9u7da/ZYR48exZgxY9CpUyd0794dL730EtLT02t0zAFg9erVGDRoENq1a4e+ffti9uzZMBgMpvbXX38dI0eOxI8//oh+/fqhY8eOeOKJJ3DmzBnTbcr7peIxOX/+PNq2bWvWp0lJSRgxYgQ6duyIO+64A99//73Zczlz5gwmTZqE7t27o23btujVqxfef/99s9+9m/sRqNznll4DO3fuROvWrU3vBZZ+7zUaDQYMGGDx9UPNE0MTOSxRFKHX66HX66HT6ZCZmYlZs2ZBq9XiwQcfrPJ+V65cwdKlS822/fbbb3jvvfcwaNAgzJ07FwaDARMmTIBWq8V9990HpVKJn3/+2ew+69atQ48ePRAUFIS//voLEydORNu2bTFv3jzMnj0bLVq0wL
vvvotjx46Z3c/Pzw+rVq0y/XvmmWeqfZ75+fn47LPPzLYdP34cU6ZMQYcOHTB//nwEBQVhwoQJyMrKAgCr6rGWpeO3ePFifPPNN3jiiSfwzTffYNWqVZgzZ45V+z1+/Di+++47fPbZZ+jYsaNZW8Xj9X//939mbTV5rsXFxRg+fDj27duHV155BXPmzIFSqcSYMWOQnJyMOXPmmNX8zDPPmB7P398fBw4cwOjRo+Hk5ITPPvsM//3vf7F//36MGjWq0ofrp59+GgMGDMCcOXMQGRmJF154Adu3bwcAlJaW4rHHHsOGDRswbtw4zJs3D507d8Ybb7yBBQsWmO2nT58+WLVqFZYvX46XX34Zu3fvxgcffFDl8UtPT8ewYcOQnJyMt99+G//73/+QlZWFJ554Anl5edUe+4rH9+Z+u3DhAoYOHYrU1FS8+eabmDlzJgRBwBNPPFHpvEJXV1f88ccfZts2bdoEicT8T9G8efPw0ksvoUOHDvjiiy8wceJEbNmyBSNHjrxlWLG1GzduYOzYsfDy8sKcOXPw/PPP4+eff8arr75a5X3UajVGjRqFixcvYtq0afjyyy8xatQobNy4EZ9++qnZbcv7seLrqVx1r+usrCw89NBDOHjwIF588UXMnj0bISEhmDhxYqVgWtPjbqvX8a2ev7+/f6X3t5t/x25m6X2lXPl9586di6ioKLz22muVvsSqjjXHsqKff/7Z9MVKTa1cuRJZWVn45ptvoFAoTNvbtGlj1t8PPfSQ2f1mzpyJefPmYdiwYViyZAnee+895OXlYfLkyVCr1QCA06dP4/HHH4dGo8HHH3+Md955BydPnsTYsWNrdMwXLlyIt956Cz169MCCBQswYsQILF68GG+99ZZZLUlJSfj0008xadIk/O9//0Nubi4ef/xxZGRkVPm8P/jgA+j1etPParUaTz31FPR6PWbPno3Bgwdj2rRppi/EMjIyMGLECKjVasyYMQOLFy/GoEGDsHz5cixbtsyqY34znU6H6dOn3/J2S5YsqTYMU/PDc5rIYR04cABt27attP2ll15CdHR0lfebPn06WrZsiVOnTpm25eTk4LHHHsNLL70EoGzkpPxb+ri4ONx5551Yv349Jk+eDEEQkJaWhr179+J///sfgLIPlkOGDMEbb7xh2mfHjh3RrVs37Nu3D+3btzdtVygU6NChg+nnS5cuVfs8v/jiCwQHB5t9g52Wloa77roL77//PiQSCXx9fXHvvffi6NGjuOOOO6yqx1qWjt/x48cRGxuLMWPGmLZZ+8eofKRvwIABldoqHq/yb2/L1eS5/vTTT7h27Rp++uknxMXFAQA6deqEBx54AAcOHMDDDz9sVnNYWJjZY86aNQuRkZFYuHCh6fyG9u3bY9CgQVizZg1GjBhhuu3IkSMxceJEAECvXr0wZMgQzJ07F3369MHatWtx7tw5fP/996Zg2KtXL+j1esybNw+PPvqoaTqQt7e3qYbExET8/fffZsf8ZkuXLoVWq8XXX38NPz8/AEBsbCyGDx+OY8eOoU+fPlXet+Jzvbnf5syZA4VCgWXLlsHNzQ0A0LdvX9x77734+OOPzUbJevfujZ07d0Kr1UKhUECj0WDbtm1ITEw0jaDk5+dj/vz5eOSRR8yCQqtWrTBixIhKx7O+LV68GF5eXpg7d66pbyUSCd58802cPXvW4rfQycnJCAwMxEcffYQWLVoAALp3745jx45VCpIV+/Fm1b2uv/76a+Tk5GDLli0ICQkBUBbARo8ejY8//hj33nuvKRT17t0bv/76KzIyMkyhbPPmzWbHHbDd6/hWz7/ie1z5+1tcXBxCQ0MtHgfA8vtKuYr3DQoKwh9//IGkpCRERkZWub/aHstyxcXFmDlzJtq2bVvt711FBoPBdF5tYmKiWZubm5tZf+/cudOsPSMjAy+++KLZSI1SqcRzzz2Hs2fPokOHDliwYAFUKhW++uorKJVKAIC/vz9efvllXLx4sd
pjXlhYaAplb775JgCgZ8+eUKlUePPNN/Hkk0+azgMuLCzEggUL0KVLFwBAQkIC7rjjDixbtgxTpkyp9Ly3bNmCY8e
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.cluster import AgglomerativeClustering\n",
"\n",
"# ========================\n",
"# Агломеративная кластеризация\n",
"# ========================\n",
"agg_cluster = AgglomerativeClustering(n_clusters=3) \n",
"labels_agg = agg_cluster.fit_predict(data_scaled)\n",
"\n",
"# ========================\n",
"# Оценка качества кластеризации\n",
"# ========================\n",
"silhouette_avg_agg = silhouette_score(data_scaled, labels_agg)\n",
"print(f'Средний коэффициент силуэта (агломеративная кластеризация): {silhouette_avg_agg:.3f}')\n",
"\n",
"# ========================\n",
"# Визуализация кластеров\n",
"# ========================\n",
"pca = PCA(n_components=2)\n",
"df_pca = pca.fit_transform(data_scaled)\n",
"\n",
"plt.figure(figsize=(10, 7))\n",
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=labels_agg, palette='viridis', alpha=0.7)\n",
"plt.title('Визуализация кластеров с помощью агломеративной кластеризации')\n",
"plt.xlabel('Первая компонентa PCA')\n",
"plt.ylabel('Вторая компонентa PCA')\n",
"plt.legend(title='Кластер', loc='upper right')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "Значение коэффициента силуэта лежит в диапазоне от -1 до 1:\n",
    "\n",
    "- Ближе к 1: хорошо сформированные, плотные кластеры, четко отделенные друг от друга. Все точки внутри каждого кластера плотно сгруппированы и значительно удалены от точек других кластеров, что свидетельствует о четкой и хорошо разделенной структуре данных. Единица говорит об идеальной кластеризации.\n",
    "- Ближе к 0: кластеры пересекаются или слабо разделены, не имеют четких границ. Точки расположены одинаково близко как к своему кластеру, так и к соседним.\n",
    "- Ближе к -1 (отрицательные значения): некоторые точки скорее относятся к другим кластерам, чем к текущему (в среднем они ближе к точкам соседнего кластера, чем к точкам своего). Очень плохая кластеризация.\n",
"\n",
    "Средний коэффициент силуэта, равный 0.409, указывает на то, что кластеры разделены умеренно хорошо, но их границы могут быть нечеткими, а выбранное число кластеров — неоптимальным. Это может быть связано с особенностями данных, затрудняющими их разделение."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}