1129 lines
1.5 MiB
Plaintext
Raw Normal View History

2024-11-30 04:53:12 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Лабораторная работа №5\n",
"\n",
"*Вариант задания:* Товары Jio Mart (вариант - 23) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['category', 'sub_category', 'href', 'items', 'price'], dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"# Location of the raw Jio Mart catalogue and the number of leading rows used in this lab\n",
"DATA_PATH = \"..//static//csv//jio_mart_items.csv\"\n",
"N_ROWS = 15000\n",
"\n",
"# Read only the first N_ROWS rows instead of loading the whole file and slicing it afterwards\n",
"df = pd.read_csv(DATA_PATH, nrows=N_ROWS)\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>category</th>\n",
" <th>sub_category</th>\n",
" <th>href</th>\n",
" <th>items</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Groceries</td>\n",
" <td>Fruits &amp; Vegetables</td>\n",
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
" <td>Fresh Dates (Pack) (Approx 450 g - 500 g)</td>\n",
" <td>109.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Groceries</td>\n",
" <td>Fruits &amp; Vegetables</td>\n",
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
" <td>Tender Coconut Cling Wrapped (1 pc) (Approx 90...</td>\n",
" <td>49.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Groceries</td>\n",
" <td>Fruits &amp; Vegetables</td>\n",
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
" <td>Mosambi 1 kg</td>\n",
" <td>69.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Groceries</td>\n",
" <td>Fruits &amp; Vegetables</td>\n",
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
" <td>Orange Imported 1 kg</td>\n",
" <td>125.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Groceries</td>\n",
" <td>Fruits &amp; Vegetables</td>\n",
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
" <td>Banana Robusta 6 pcs (Box) (Approx 800 g - 110...</td>\n",
" <td>44.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" category sub_category \\\n",
"0 Groceries Fruits & Vegetables \n",
"1 Groceries Fruits & Vegetables \n",
"2 Groceries Fruits & Vegetables \n",
"3 Groceries Fruits & Vegetables \n",
"4 Groceries Fruits & Vegetables \n",
"\n",
" href \\\n",
"0 https://www.jiomart.com/c/groceries/fruits-veg... \n",
"1 https://www.jiomart.com/c/groceries/fruits-veg... \n",
"2 https://www.jiomart.com/c/groceries/fruits-veg... \n",
"3 https://www.jiomart.com/c/groceries/fruits-veg... \n",
"4 https://www.jiomart.com/c/groceries/fruits-veg... \n",
"\n",
" items price \n",
"0 Fresh Dates (Pack) (Approx 450 g - 500 g) 109.0 \n",
"1 Tender Coconut Cling Wrapped (1 pc) (Approx 90... 49.0 \n",
"2 Mosambi 1 kg 69.0 \n",
"3 Orange Imported 1 kg 125.0 \n",
"4 Banana Robusta 6 pcs (Box) (Approx 800 g - 110... 44.0 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>15000.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>373.427633</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>463.957949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>123.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>250.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>446.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>14999.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" price\n",
"count 15000.000000\n",
"mean 373.427633\n",
"std 463.957949\n",
"min 5.000000\n",
"25% 123.000000\n",
"50% 250.000000\n",
"75% 446.000000\n",
"max 14999.000000"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"category 0\n",
"sub_category 0\n",
"href 0\n",
"items 0\n",
"price 0\n",
"dtype: int64\n",
"category False\n",
"sub_category False\n",
"href False\n",
"items False\n",
"price False\n",
"dtype: bool\n"
]
}
],
"source": [
"# Share of missing values per column, in percent; only columns that have gaps are reported\n",
"null_rates = df.isnull().mean() * 100\n",
"for column, null_rate in null_rates[null_rates > 0].items():\n",
"    print(f'{column} Процент пустых значений: {null_rate:.2f}%')\n",
"\n",
"# Absolute number of missing values in each column\n",
"print(df.isnull().sum())\n",
"\n",
"# Per-column flag: does the column contain at least one missing value?\n",
"print(df.isnull().any())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"category object\n",
"sub_category object\n",
"href object\n",
"items object\n",
"price float64\n",
"dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Проверка типов столбцов\n",
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Атрибуты \n",
"\n",
"category: Категория товара.\n",
"\n",
"sub_category: Подкатегория товара.\n",
"\n",
"href: Ссылка на товар.\n",
"\n",
"items: Название товара.\n",
"\n",
"price: Цена товара.\n",
"\n",
"# Цель:\n",
"Оптимизация стратегий ценообразования и маркетинга для розничных компаний, стремящихся привлечь покупателей с различными предпочтениями.\n",
"Кластеризация товаров на основе их характеристик (категория, подкатегория, цена) для выявления групп с похожими профилями."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Очистка данных"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Цель: Упростить набор данных, удалив несущественные столбцы, чтобы сосредоточиться на ключевых атрибутах, которые будут использоваться для кластеризации и анализа.\n",
"\n",
"Столбцы href и items несущественны для анализа, они не содержат ценной информации для решения задачи."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" category sub_category price\n",
"0 Groceries Fruits & Vegetables 109.0\n",
"1 Groceries Fruits & Vegetables 49.0\n",
"2 Groceries Fruits & Vegetables 69.0\n",
"3 Groceries Fruits & Vegetables 125.0\n",
"4 Groceries Fruits & Vegetables 44.0\n"
]
}
],
"source": [
"# Удаление несущественных столбцов\n",
"columns_to_drop = ['href', 'items']\n",
"df_cleaned = df.drop(columns=columns_to_drop)\n",
"\n",
"print(df_cleaned.head()) # Вывод очищенного DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Визуализация парных взаимосвязей\n",
"Визуализировать ключевые атрибуты товаров для выявления закономерностей и связей между ними."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAA1TCAYAAABQAC/wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzde5zWdZ3//+cMDGeYgOQUi5wSUFAoQWyjjMzKzHLZbVsFy7Is26+ZmvVTK9NvtqWmedo8prK6aumaq9W6tbVbbSK4qRjDKuKoBIgHHM4zDFy/P64vYxPkjIBeHy7v99uNG8z7/bk+vqB/YB593p+aUqlUCgAAAAAAQIHUVnoAAAAAAACAPyVgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAO6VUKlV6BAAAoIoJGAAA8DqxcOHCfPGLX8whhxyS/fffP4ceemi+8pWv5Omnn37F93rsscfyd3/3d6/ClNVh3rx5GTduXLsf48ePz1ve8pZ89KMfzX/8x390eI8vf/nLmTlz5mswLQAAFFPXSg8AAAC8+m666aacd955Oeigg3Lqqadm0KBBefLJJ3Pttdfm3nvvzQ033JDx48d3+n4//elP87vf/e5VnLg6fPWrX81+++2XpPzESlNTU6677rqceOKJufLKK/POd77zz372xBNPzLHHHvtajQoAAIUjYAAAQJV74IEH8o1vfCPHHHNMzjzzzLb1gw46KIceemg+/OEP54wzzsgdd9xRwSmr09ixYzN58uR2awceeGAOOeSQ3HjjjS8bMEaMGPEqTwcAAMXmCCkAAKhy1157bfr27ZtTTjllu70BAwbky1/+ct797ndnw4YNSZJNmzblwgsvzGGHHZaJEyfmLW95S4477rg0NDQkSS699NJcdtllSZJx48bl0ksvTZJs3bo1V111Vd7znvdk4sSJee9735u5c+fucJ53v/vd2X///duOUxo3blzmzZvXds3ChQvzyU9+MgcddFDe8pa35DOf+Uwee+yxtv1tRzTdcsstede73pW3vOUt+dnPfpZx48bl17/+dbv/3oIFCzJu3Lg88MAD283yr//6rxk3blweffTRduvb7rVo0aIkyQ033JD3ve99mTRpUmbMmJGzzz4769at6/gPfwf69OmTUaNGZfny5X/29/Kb3/xmuyOkSqVSrr/++rz//e/P/vvvn/e85z259tpr272LZMGCBZk9e3YOOOCATJs2LV/60pfywgsv7NScAABQaZ7AAACAKlYqlfLrX/86M2fOTM+ePXd4zeGHH97u69NPPz0LFizIKaeckhEjRuTJJ5/Md7/73Zx66qm555578jd/8zdZuXJlfvjDH+bWW2/NkCFDkiRnn3127rjjjpxwwgmZMmVK5s+fn/POOy9r1qzJ5z73uSTJZZddlssvvzyf/OQnM3369PzqV7/KySef3O6/f9999+X444/PQQcdlPPOOy/Nzc258sor89GPfjS33XZbxowZ03btZZddlrPOOiubNm3K2972tgwaNCg/+tGP8va3v73tmjvvvDMjR47MW9/61u1+74ceemh69eqVe+65J/vss0/b+t133503v/nN2XfffXP33Xfn/PPPz5e+9KWMGzcuS5cuzbe+9a1s3Lgx3/rWt17Z/yBJWlpasmzZsuy///7t1v/49zJlypT867/+a7v9b3/727nhhhty3HHH5S//8i+zcOHCXHDBBWltbc0JJ5yQ+fPn57jjjsv06dNz8cUXp6mpKd/97ndz7LHH5oc//GF69OjximcFAIBKEjAAAKCKrV69Os3NzRk+fHinrm9pacn69etz1llntYWNadOmZd26dfmHf/iHPPfccxkyZEhbtNh2PNITTzyR2267Laeccko+/elPJ0ne/va3p6amJldeeWWOPvrodO/ePVdffXWOOeaYnHbaaW
3XbNy4MbfeemvbDBdeeGH23nvvXHXVVenSpUvbde95z3tyySWX5Lvf/W7btUcffXTe9773tX191FFHZe7cuVm/fn169+6dTZs25Sc/+UnbTH+qZ8+eee9735sf//jH+cIXvpAkWb9+fX7xi1+0RZf7778/w4cPzzHHHJPa2tpMmzYtvXr1SlNTU4d/nlu3bk1ra2uSpLW1NX/4wx9yxRVX5IUXXsgxxxzT7to//b38sTVr1uTGG2/M7Nmz88UvfjFJ8ra3vS3PPvts5s+fnxNOOCEXXnhhRo0alSuvvLLtz+2AAw7IBz7wgdx+++3b/fcAAKDoHCEFAABVbNs3srds2dKp67t165Zrr702hx9+eJ555pncd999ueWWW/KLX/wiSTlw7Mh9992XUqmUmTNnprW1te3HzJkz09zcnAceeCAPPvhgNm3atN036Y844oi2X2/YsCELFy7M+9///rbZk6Rfv35517velfvvv7/dZydMmNDu61mzZmXDhg3593//9yTJv//7v2fDhg358Ic//Gd/zx/60Ify1FNP5eGHH06S/PznP09LS0uOPPLIJMn06dPzxBNP5K/+6q9y2WWXZeHChfngBz+YOXPmvNwfZZLk4x//ePbbb7/st99+OeCAA3L44Yfnt7/9bc4666y84x3veNnfyx978MEH09ramsMOO6zd+llnnZVrrrkmGzduzEMPPZR3vvOdKZVKbX/+f/EXf5ExY8bkN7/5TYezAgBA0XgCAwAAqlh9fX169+7d9r6FHdmwYUM2b96c+vr6JMmvfvWrnHfeeVm6dGl69+6d8ePHp1evXknS7n0Lf+zFF19MknzgAx/Y4f4zzzzTdv8BAwa02xs4cGDbr9euXZtSqZQ3vvGN293jjW98Y9auXdtubdtc2+y9996ZNm1a7rzzznz4wx/OnXfembe97W0ZPHjwn/vt56CDDsrgwYNzzz33ZP/9988999yTadOmtT1lcvjhh2fr1q25+eabc8UVV+TSSy/Nm970ppx22mnbHb/1p77+9a9nv/32S1KOSfX19Rk2bFhqamq2u/ZPfy9/bNuf75/+2W2zZs2abN26NVdffXWuvvrq7fa7d+/+snMCAEARCRgAAFDl3v72t2fevHlpbm7e4Teyb7vttnzrW9/KD3/4w/Tt2zef+9zncuihh+bKK6/MX/zFX6SmpiY33XRTfvWrX/3Z/0a/fv2SlF923bt37+32hw0blieeeCJJ8vzzz2f06NFte3/8kum+ffumpqYmzz333Hb3ePbZZ/OGN7yhw9/vrFmzcsYZZ+Txxx/Pb3/721xwwQUve31tbW0++MEP5u67785nPvOZ/OY3v8k555zT7pojjjgiRxxxRNauXZtf//rXufrqq/PFL34xb33rW182jowaNSqTJk3qcOaObPvzfeGFF9r92S1fvjxPPfVUJk6cmJqamnz84x/fYUT6c+8/AQCAInOEFAAAVLlPfOITefHFF3PxxRdvt/fss8/muuuuy9ixY7PffvvlkUceSXNzcz796U9nxIgRbU8KbIsX257AqK1t/0+JAw88MEn5nRuTJk1q+/HCCy/ku9/9bl588cWMHz8+ffv2bTveaZt777237de9evXKxIkT85Of/KTdsVdr167NL3/5yx2+iPtPvfe9703Pnj1z9tlnp3fv3jn00EM7/MyHPvShrFy5Mpdffnm6dOnS7qimk08+ue19GH379s373//+nHjiiWltbc2qVas6vPfusP/++6eurq7tKK9trrvuupxyyinp1atX9t133yxdurTdn/+b3/zmXHrppZk3b95rMicAAOxOnsAAAIAqN3ny5Hz+85/PxRdfnMcffzwf/vCH079//zz22GO59tpr09zc3BY39ttvv3Tt2jXnn39+PvGJT6SlpSV33HFHfvnLXyYpHzeVvPREwN13350DDjgg48aNy5FHHpmvfOUr+cMf/pCJEyfmiSeeyEUXXZThw4dn5MiR6dKlS4
4//vhccskl6dmzZ6ZNm5b7778///zP/5zkpShy6qmn5pOf/GQ+/elP5+ijj87mzZtz1VVXpaWlpS0kvJyePXvmAx/
"text/plain": [
"<Figure size 1600x4500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Shared style for all panels (seaborn and pyplot are imported in the first code cell)\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"# Three stacked panels: the grid has exactly as many rows as there are plots,\n",
"# so no empty slot is left at the bottom of the figure\n",
"fig, (ax_price_by_cat, ax_price_by_sub, ax_counts) = plt.subplots(3, 1, figsize=(16, 45))\n",
"\n",
"# Panel 1: category vs price\n",
"sns.scatterplot(x=df_cleaned['category'], y=df_cleaned['price'], alpha=0.6, color='blue', ax=ax_price_by_cat)\n",
"ax_price_by_cat.set(title='Category vs Price', xlabel='Category', ylabel='Price')\n",
"ax_price_by_cat.tick_params(axis='x', labelrotation=90)\n",
"\n",
"# Panel 2: sub-category vs price\n",
"sns.boxplot(x=df_cleaned['sub_category'], y=df_cleaned['price'], color='green', ax=ax_price_by_sub)\n",
"ax_price_by_sub.set(title='Sub-Category vs Price', xlabel='Sub-Category', ylabel='Price')\n",
"ax_price_by_sub.tick_params(axis='x', labelrotation=90)\n",
"\n",
"# Panel 3: number of items per category, split by sub-category\n",
"sns.countplot(x=df_cleaned['category'], hue=df_cleaned['sub_category'], palette='Set3', ax=ax_counts)\n",
"ax_counts.set(title='Category vs Sub-Category', xlabel='Category', ylabel='Count')\n",
"ax_counts.tick_params(axis='x', labelrotation=90)\n",
"\n",
"# Keep the panels and the rotated tick labels from overlapping\n",
"fig.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Стандартизация данных для кластеризации"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" price category_Groceries sub_category_Dairy & Bakery \\\n",
"0 -0.569958 1.0 0.0 \n",
"1 -0.699284 1.0 0.0 \n",
"2 -0.656175 1.0 0.0 \n",
"3 -0.535471 1.0 0.0 \n",
"4 -0.710061 1.0 0.0 \n",
"\n",
" sub_category_Fruits & Vegetables sub_category_Premium Fruits \\\n",
"0 1.0 0.0 \n",
"1 1.0 0.0 \n",
"2 1.0 0.0 \n",
"3 1.0 0.0 \n",
"4 1.0 0.0 \n",
"\n",
" sub_category_Snacks & Branded Foods sub_category_Staples \n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n"
]
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"\n",
"# Split the features by type: numeric ones are scaled, categorical ones are one-hot encoded\n",
"numerical_cols = ['price']\n",
"categorical_cols = ['category', 'sub_category']\n",
"\n",
"# Standardize the numeric features\n",
"scaler = StandardScaler()\n",
"df_numerical_scaled = scaler.fit_transform(df_cleaned[numerical_cols])\n",
"\n",
"# One-hot encode the categorical features; sparse_output=False yields a dense array\n",
"encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)\n",
"encoded_data = encoder.fit_transform(df_cleaned[categorical_cols])\n",
"\n",
"# Wrap both arrays in DataFrames that reuse df_cleaned's index, so the concat below\n",
"# aligns rows correctly even when df_cleaned does not carry a default RangeIndex\n",
"numerical_df = pd.DataFrame(df_numerical_scaled, columns=numerical_cols, index=df_cleaned.index)\n",
"encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols), index=df_cleaned.index)\n",
"\n",
"# Combine the scaled numeric and the encoded categorical features\n",
"df_encoded = pd.concat([numerical_df, encoded_df], axis=1)\n",
"\n",
"# Show the result\n",
"print(df_encoded.head())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABR4AAAP0CAYAAADMUCCZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxddZ0//te9TdJsbYoBStsUgVYLKCBSEBcQi0VGkRHRGVxQ+elXVBgVdETcly+oiAwqygzjhiiDGyLofPXLJigqWB0VFUHK1mBpS+iapbnpvb8/sPkSupDQE5K0z+fjwYPmnHPPead9N2le97OUarVaLQAAAAAABSqPdQEAAAAAwPZH8AgAAAAAFE7wCAAAAAAUTvAIAAAAABRO8AgAAAAAFE7wCAAAAAAUTvAIAAAAABRO8AgAAAAAFE7wCADANqvVamNdAqPMnzEAMFKCRwBgXDvxxBMzb968If89/elPzxFHHJGPfvSjWb169Savufvuu/ORj3wkL3zhC7P//vvniCOOyOmnn56//OUvW3zOv/3bv2XevHn5+Mc//pg1nXTSSTnkkEPS39+/xWte+tKX5jWveU2SZN68efn85z8/jM+2WO9973uzYMGCwY8XLFiQ9773vYU+44EHHsib3/zm3H///aP6nG2xbt26vOUtb8kBBxyQgw8+OPfcc88m11x++eWb9Nk+++yTgw8+OP/f//f/5Te/+c1jPufEE0/MiSeeOAqfwVDvfe97N6n1kf/9+Mc/LvR5/f39Ofvss3PVVVcVel8AYPtXN9YFAAA8ln333Tcf/vCHBz+uVCr505/+lPPOOy+33XZb/uu//iulUilJ8n//7//Ne97znjzlKU/JW9/61nR0dOSBBx7IxRdfnH/6p3/KhRdemOc+97lD7l+tVnPFFVfkqU99an7wgx/k3e9+d5qamrZYz/HHH59f/OIXufHGG/PCF75wk/N/+tOfcscdd+RTn/pUkuRb3/pWdttttyJ+K7bJBRdckNbW1kLv+Ytf/CI33HDDqD9nW1xxxRW5/vrr86EPfShPecpT0tHRscVrL7jgguyyyy5JHu6LBx98MF/4whfy+te/Pt/97nez9957b/G1j+zR0bbLLrvkggsu2Oy5PfbYo9BnLV++PBdffHE+8YlPFHpfAGD7J3gEAMa91tbWPOMZzxhy7OCDD053d3c+97nP5fe//32e8Yxn5L777ssZZ5yRww47LOeff34mTZo0eP1RRx2VV73qVTnjjDNy3XXXpaGhYfDcz3/+8zzwwAM577zz8trXvjY//OEP88pXvnKL9SxcuDBtbW258sorNxs8fv/7309ra2te9KIXJckmtY+Vfffdd7t6znCtWrUqSfLqV796MKDekn322WeTYHLffffNwoULc+mll+ZjH/vYFl87d+7cba51uBoaGsZNXwEAbImp1gDAhPX0pz89SfK3v/0tSXLJJZekv78/H/jAB4aEjknS1NSUM844I8cff/wm07O/973v5alPfWoOOuigPOtZz8q3vvWtrT538uTJOeaYY/LTn/4069atG3KuUqnkRz/6UV7ykpcMjpp89FTriy++OEcffXT222+/HHbYYfnIRz4yeJ/Ozs7Mmzcvl19++ZD7Pnra9IYNG3LRRRflmGOOyf77759nPOMZOeGEE/KrX/1qi3U/cgr05z//+S1O1d1Y62M94/LLL8+ZZ56ZJDnyyCMH7/3oqdZr167NJz7xibzwhS/Mfvvtl2OOOSbf/e53N6ntc5/7XD71qU/lOc95Tvbff/+88Y1v3Oy06Edav359vvCFLwz+fh511FG56KKLUq1Wkzw8/Xnj57P33ns/ringHR0d2WmnnQb77PLLL8++++6b73znO3nuc5+bQw45JHfeeecmU637+/tz/vnn58gjj8z++++fY445Jt///veH3Puaa67Jy1/+8uy333557nOfm//9v/93enp6Rlzjlgzn/tdcc0
1e/epX58ADD8zTn/70HH300fnmN7+Z5OF+PPLII5MkZ5555mAPbm5a+c0335x58+bl5ptv3urv0xPxeQMA44MRjwDAhHX33XcnSWbPnp0k+dnPfpZ9990306dP3+z1z372s/PsZz97yLFVq1bluuuuyzvf+c4kyXHHHZf3vOc9+dOf/pSnPe1pW3z28ccfn29+85v5yU9+kuOPP37w+I033piHHnpoiyMmf/jDH+bTn/50zjjjjMybNy933XVXPvWpT6W3t3dwavZwnHvuufmv//qvvOtd78q8efOybNmyfOELX8g73vGO/PSnP93qVPEkeeUrX5nDDjtsyLFzzjknf/nLX/LiF794WM844ogj8ta3vjUXXnhhLrjggsybN2+T5/T19eXVr351urq68va3vz2zZs3KNddck/e///158MEH85a3vGXw2q9//es56KCD8olPfCKrV6/OWWedlTPOOGOLQXCtVstb3vKW/O53v8upp56avffeOzfffHPOP//8LFmyJB//+Mfz4Q9/OF/96lfz3e9+N9/61rfypCc9adi/xxutXLkyK1euzO677z54bMOGDfnKV76Ss846KytXrsycOXM2ed273/3u3HDDDXnrW9+aAw44IDfccEPe+973pr6+Psccc0yuuuqqvPvd785LX/rSvPOd78z999+ff/u3f8udd96Zr371q485OnNgYGCTY5MmTRp83XDu/9Of/jSnnHJKXve61+Vf/uVf0tfXNziy8+lPf3r22WefXHDBBTn11FPz1re+NUcdddSIfu829/u0rZ83ADBxCB4BgHGvVqsNCVlWr16dW265JRdeeOHgKK3k4Y1O9tlnnxHd+6qrrkq1Ws0//uM/Jnl4SvbHPvaxXHbZZVvdaOZpT3ta9tlnn1x11VVDgscrrrgi8+bNy3777bfZ191yyy3p6OjIa17zmpTL5RxyyCFpbm7e7CY5W7N8+fKcdtppQ0adTZ48Of/yL/+S22+//TGn4e62225D1p382te+lt/+9re54IILBkO04TxjYxi3uSnKycOj3u64445cdtllOfDAA5Mkhx12WAYGBvLFL34xJ5xwQqZNm5YkmTp1ar74xS8Ojla977778vnPfz4rV67MTjvttMm9b7zxxvziF7/Ieeedl5e85CVJkuc+97lpbGzMZz/72bzuda/LU57ylMHPczhTk6vV6mCvrV+/Pvfcc0/OPffclMvl/PM///OQa9/ylrfkiCOO2Ox97rjjjvzkJz/J+973vrz+9a9P8nDwff/99+fmm2/OS17ykpx77rk57LDDcu655w6+bo899sgb3vCG3HDDDVu8d5Lcf//9mw3G3/Wud+XNb35zarXasO5/55135rjjjsv73//+wWsOPPDAPOtZz8rNN9+cAw44YPDv1O677/64ptE/8vdpuHUBANsHwSMAMO79+te/3iRkKZfLec5znpOPfexjgyOkJk2alA0bNozo3t/73vfyrGc9Kw0NDVmzZk2Sh6f9/vCHP8wZZ5yx1U1Sjj/++Jx99tlZtmxZpk+fnlWrVuX666/Pe97zni2+5tBDD823vvWtvPzlL88LX/jCPP/5z89LX/rSEY/y+sxnPpMkeeihh3LXXXfl3nvvzfXXX58kW91te3N+9rOf5Zxzzsnb3va2IWtWFvGMW265JbNmzRoMHTc69thj893vfje///3v8/znPz9Jst9++w2ZIr8xMOzt7d1s8HjLLbekrq4uRx999Cb3/uxnP5tbbrklT3nKU4ZV50YLFy7c5NisWbPy6U9/epMRnVsLuTfugv3oEYIbp30vXrw4DzzwQE4++eQhofrBBx+c1tbW3HTTTVsN4HbZZZdceOGFmxzf+Ht21113Dev+b3rTm5Ik3d3dufvuu3Pffffl1ltvTTLyPtqSR/4+DbcuAGD7IHgEAMa9pz3tafnoRz+aJCmVSpk8eXJmzJixSSg4c+bMwXX4NqdSqWT16tXZeeedkyR//vOfc9
tttyV5OPh4tCuvvDKvfvWrt3i/l770pTnnnHPy3//93znppJPyox/9KKVSKccee+wWX/PiF7841Wo1l156ab74xS/
"text/plain": [
"<Figure size 1600x1200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.decomposition import PCA\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# PCA is fitted on the scaled numeric data ONLY\n",
"pca = PCA(n_components=1)\n",
"kc_pca = pca.fit_transform(df_numerical_scaled)\n",
"\n",
"# Plot the component value of every sample against its position in the data\n",
"fig, ax = plt.subplots(figsize=(16, 12))\n",
"ax.scatter(range(len(kc_pca)), kc_pca, alpha=0.6)\n",
"ax.set_title(\"PCA Visualization of Price Feature\")\n",
"ax.set_xlabel(\"Sample Index\")\n",
"ax.set_ylabel(\"Principal Component 1\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Агломеративная (иерархическая) кластеризация"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABSoAAAP5CAYAAAAR1hEFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACd5klEQVR4nOzdd5hV9b3/7ffgDAKKDRUVKxq7IBbUkyiKxngieoImHmONJTbQxHIssXdjsCCoEY1RFGPDemKisURzEkWxJ2qMNYICKoIVGIb9/MHD/jEwUobRL8J9X5eXM2vt8tmFDfOaVWoqlUolAAAAAAAFtSo9AAAAAACAUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCUCLOfHEE7POOus0+d+JJ55YejxgOuPHj8+mm26aF154IePHj8/hhx+ea6+9tvRYzAdeeOGFfO9738ukSZNKjzJfeOihh7Lbbrvl3XffzYgRI9KzZ8/8/e9/Lz3WfG/EiBFZZ511MmDAgNxxxx1ZZ511MmzYsHm+3Wm32VJmNec+++yT++67r8XuC4DZqy09AAALluWWWy4DBw5stKxv376FpgG+zJJLLpkDDjgge+yxRyqVStZZZ5388pe/LD0WhU2cODEnnHBC/ud//ietW7cuPc58oUePHrnpppuy3XbbJUl69+6dDTfcsPBU87+amprq/6f/el7dcsstWWGFFeb5dqaZ1Zy/+MUvctBBB2WLLbZIhw4dWuw+AfhyQiUALaahoSHt2rXLxhtv3Gi5H3Zh/tS3b9/sueee+fjjj7PaaqtlkUUWKT0Shd10002pra3NDjvsUHqU+UZtbW1+85vf5J133skiiyySlVZaqfRI3wjLLbdcamtr07Fjx2pYXHHFFef5dmf8N8a8mtWc66+/frp06ZIrr7wyp5xySoveLwBNs+s3AC1m8uTJadOmzRxddvjw4dlnn33StWvXdO/ePSeccELGjh1bXT9t96sRI0Y0ul7Pnj0b7UZeX1//pbubz3hbzz//fHr37p0uXbpkl112yR//+MdGt/3JJ5/k/PPPzw477JCNNtoovXr1yu233z7T/c94PyNGjMi+++6bE088Mb/+9a/zH//xH9l0001zxBFHZOTIkY2u/+CDD2avvfZKt27dsuGGG2annXbKkCFDquuHDRtWvd2nn3660XVvvPHGrLPOOunZs+dM88z4A9T48eOz4YYbzrSr3ezu/8vcdttt2W233bLxxhunS5cu+a//+q/84Q9/mOk5bmp3/y97ffbdd99G93Hfffdlt912S7du3fLtb387p512WsaPH19dP2DAgKyzzjrp1q3bTLukHnXUUTMdYmDixIm58MIL06NHj2y44YbZZZddZtqFr2fPnrnkkkty3nnnZfPNN88WW2yR448/PuPGjZvjxz+rQx7ccccd1dd0+tfhww8/zGabbdbka7nOOutk3XXXzeabb54jjzwyH330UfUyTe3yOO15ac5zmSTLLrtsOnfunL/97W+zPUzDjPf1+9//PptvvnkuuuiiJI3fvzP+N/3cr7zySvr27Zstt9wyG2ywQbbeeuucc845mTBhQvUykyZNyqWXXprtt98+Xbp0Sa9evXLnnXfO0XOeJO+++26OOeaYdO/ePV27ds3++++fl156qXr703b3/P3vf5/DDjssXbt2zbbbbpvLL788U6ZMafS6zPicHHPMMY1e00qlkv79+2frrbfOpptumsMOOyzvvfde9fINDQ0ZNGhQevXqlS5dumTjjTfOnnvumSeeeGKWr2My82s+4/eVSiV77rlno8/LE088sdF7K0luvvnm2e4yO2nSpPz2t79Nr169qsv23Xffmf6szviebmr2v/zlLzO9nz799NOcffbZ2XrrrbPxxhtn9913z5///OeZbnd275+JEyfm8ssvz0477ZSNNt
ooO+64YwYNGtToddt3330bXX+TTTbJgQcemHfeeWeub2fa419llVWy0kor5aKLLmr0XmvKjPc/498ZydTXad99983tt9+e7bbbLt26dcv++++fV155pXo7Tf1d+K9//SsbbLBBo9fl5Zdfzt57751u3bplhx12yM033/ylr1dTjy1Jxo4dmzPPPDPbbbddNtxww3Tv3j19+vRpdN8zXufcc8/NRhttlMcee2ym56B169bp3Llz1lprray77rpp165dVl555SRz9tl74oknZv/998/pp5+eTTbZJN///vfT0NAw0/thzJgxOeGEE7LVVlulW7du2WefffLss89W10+ZMiWDBg3Kd7/73Wy44Yb53ve+lxtuuGGO5kySXXbZJbfffnujf6MA8NWxRSUALeaLL77IkksuOdvLPfXUUznggAOy5ZZb5tJLL8348ePTv3//7Lfffrn99tvnOHYmU3/QTJIrr7wyyyyzTJKpUWnGwJgkhx56aPbZZ58cffTRuf322/Pzn/88V111VXr06JEJEyZkr732yocffpijjjoqnTp1yoMPPpiTTz45H3zwQQ477LDq7fTo0SNHHHFE9fvll18+ydTjmC299NI55ZRTMmXKlFx00UXZd9998/vf/z5t27bNn//85/Tp0yf77bdfjjzyyEyYMCE33XRTzjrrrGy44Ybp2rVr9TYXW2yxPPzww9l0002ry+677760ajXz7xgXW2yx/PnPf06lUqnurvbAAw+koaGh0eXm5v6nN2TIkJxzzjk58sgjs+mmm2b8+PG5+uqrc9xxx6Vbt26NdsEbOHBglltuuSSpvh5J8sMf/jA/+tGPqt+feeaZje7jiiuuyGWXXZa99torRx99dN555530798/zz33XG699dZG74mampo8/vjj6dGjR5Lks88+y6OPPtroualUKunTp0+eeeaZHHXUUVlzzTXzpz/9KUcffXQmTZqUH/zgB9XL3nTTTVlttdVy/vnnZ+zYsbnooovy9ttv5+abb05NTc1sH/8RRxyRPffcM8nULRTXX3/96vtj1VVXzb/+9a+ZntOLLroon3zySZZYYolGy6e9t+rr6/P666/nwgsvzLnnnpt+/fo1+do0ZW6ey2nq6+tz3nnnzfF9JMmECRNy1lln5eCDD84uu+zSaN1pp52WDTbYoPr9f//3f1e/HjNmTPbee+9svPHGueCCC9K6des89thj+e1vf5vll18+hxxySJLkuOOOy6OPPprDDz88Xbt2zaOPPpoTTzwxdXV1s33Ox44dmz333DNt27bNqaeemrZt2+b666/P3nvvndtvvz1rrrlmdZ4zzjgjPXr0yIABA/L0009n4MCB+fzzz/M///M/TT7u4cOH5/e//32jZdddd12uuuqqHH/88VljjTVywQUX5Gc/+1luvfXWJEm/fv3yu9/9Lscee2zWWWedjB49Opdffnl+9rOf5c9//nPatm07V8/99O6+++5GUaYp48ePz6WXXjrb2xo2bFhGjx6dHXfcsdnzJE2/nxoaGnLggQfmrbfeylFHHZXOnTvnzjvvTJ8+fXL99ddns802q152Vu+fSqWSww47LM8991z69u2bddddN8OGDcull16ad955J2effXb1suuvv35OP/30TJ48OSNGjMhFF12U448/Pr/73e/m6nam9+9//zvXXXfdHD0P0+5/mj//+c+58sorG13m5ZdfzhtvvJFjjjkmSy65ZC677LLqcRGn/d0yo3PPPTeTJ0+ufv/FF1/kpz/9aTp16pQBAwbkmWeeyemnn56VVlop22yzzRzNWqlUcuihh2b8+PE57rjjsuyyy+af//xnLr300px++un5zW9+M9N1Xnjhhfzud79L//79061btyZv9957761+PeP7dHafvcnUP2+LLrpoLr/88nz++eczbfX92Wef5cc//nEaGhryP//zP+nYsWOuvfbaHHjggbnzzjuz+uqr54wzzsgdd9yRQw89NN26dc
tTTz2V8847Lx9//HH69Okz2zl79uyZhoaG/OlPf2r0XgTgqyFUAtBixo0b96U/WE3voosuyhprrJGrrrqq+kNH165
"text/plain": [
"<Figure size 1600x1200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 1 1 ... 1 2 1]\n"
]
}
],
"source": [
"# Ward linkage computed on the scaled numeric data only\n",
"linkage_matrix = linkage(df_numerical_scaled, method='ward')\n",
"\n",
"# Draw the dendrogram on an explicitly created figure\n",
"fig, ax = plt.subplots(figsize=(16, 12))\n",
"dendrogram(linkage_matrix, ax=ax)\n",
"ax.set_title('Дендрограмма агломеративной кластеризации (числовой признак \"price\")')\n",
"ax.set_xlabel('Индекс образца')\n",
"ax.set_ylabel('Расстояние')\n",
"plt.show()\n",
"\n",
"# Cut the tree at distance 100 to obtain a flat cluster label for every sample\n",
"result = fcluster(linkage_matrix, t=100, criterion='distance')\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABJ8AAAMQCAYAAACJzMTyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACbM0lEQVR4nOzdeZxVdf0/8NesgMomIqiouOLCIiYulaVoZm5f/drivuVXzSVzyfKXpWaZFe6oaWYu6dclzeWbZblX7vuKpoIJygAiIAjMMNzfHzQTwz4wh5nB5/Px4MG955x7zvuce865577mcz63rFQqlQIAAAAABShv7QIAAAAAWHEJnwAAAAAojPAJAAAAgMIInwAAAAAojPAJAAAAgMIInwAAAAAojPAJAAAAgMIInwAAAAAojPAJAAAAgMIInwBYZgcffHAOPvjgBY6766670q9fv3z/+99fzlUBS+uYY47Jbbfd1tpl0EJ+//vf56ijjmrtMgD4FBM+AVCYCRMm5Nxzz23tMoBmuOOOO1JTU5N99923tUuhhey7774ZP358fv/737d2KQB8SgmfACjMj3/843zyySdZaaWVWrsUYAnMmDEjw4YNyzHHHJPycpeJK4qysrIcffTRueCCCzJjxozWLgeATyFXFQAU4i9/+Uvuu+++HHfccenevXuTcbNnz85VV12VL33pS+nfv3++/OUv54YbbmgyzcEHH5zvf//7+dWvfpXPfvaz+cxnPpNjjz02Y8aMaTLd/fffnwMOOCCDBw9O//79s+uuu+bGG29sHP/kk0+mX79++fvf/54DDzwwAwcOzC677JKbbrqpcZqf/exn6devX5544onGYXfccUf69euXO++8s7GeeW8tPP/889OvX7/ccccdSZJ+/frl0ksvbTLNpZdemn79+s1X83//939nwIAB+dznPpef/OQn+eSTT5pM88ILL+SII47IlltumW233TYnn3xyampqmqzTk08+mSR58803s/POO2e//fZb4u2SJFdddVV22mmnbLbZZunXr1/jv3nXYW7f//73M3To0Mbn11xzTQYPHpybb765yXZb0L+G7ZQkTz/9dL75zW9myJAh6d+/f4YOHZpLL700s2fPbpxm6tSpOeecc7L99ttniy22yL777puHH3648f1Y2HLm3i5HH310ttxyy2y55ZY57rjj8t577zXOf0n2jWT+97VUKmW//fZLv379Mnr06CTJzJkzc9ZZZ2W77bbLNttsk1NPPTWTJ09ufM2MGTNy/vnnZ5dddkn//v2z5ZZb5vDDD8/rr7++0G2bJKNHj26y7eZ93rDsnXbaqcl+9s9//rNx+867fRbl9ttvz8yZM7Pjjjs2GX7hhRcucFvPu6/cdttt2X333dO/f//ssMMOufTSS1NfX9+sdUySf/zjHwtc3tzH4Pe///0cfPDB+f3vf58dd9wxgwcPzqGHHpoRI0Y0mf+oUaPy7W9/O5/73OeyxRZb5OCDD86zzz473/Ln/tdQY79+/fK73/0u3/ve9zJ48OB89rOfzU9/+tPMnDmz8fX19fW56qqrsscee2TgwIHZYostst9++zU5nzScBwYPHpza2tom9X37299ucmvy3PXcfffdTaZ96KGH5nsfl2T5SbLjjjtm5syZuf3227Mwizp+536vx40bl9NPPz1f/OIXM3DgwHz1q1/NAw88sND5zj3vhmOmwdChQ5vclr2knw/zno/nPS8miz8HALD8VLZ2AQCseCZNmpSzzz47m2++eY488sjceuutTcafddZZueOOO3L00Udn8ODBefrpp3PuuedmypQpOe644xqne+CBB9K9e/ecccYZmT17ds4///wcfPDB+eMf/5hOnTrl4YcfznHHHZdDDjkkJ5xwQmbMmJGbbropP/7xj9O/f/8MGjSocV4nnXRS9t577xxzzDF54IEHcvbZZydJDjjggJx00kl5+OGHc+aZZ+aee+7JhAkT8t
Of/jRf+cpXsvfeey9wHf/1r3/l2muvbfa2ueeee3Lqqadmzz33zHe+852MGTMmF154Yd5666389re/TVlZWV577bUcdNBBGTRoUH7xi1+kvr4+559/fr75zW82hmFz++Uvf5n+/fvnW9/6VpIs0Xa58847c/755+foo4/Odtttl06dOiVJvvGNbyzxutTU1OSCCy7Ij3/843zxi19sMm748OHp2bNnkmT8+PE5/vjjG8eNGDEihx12WHbddddceOGFKZVKueeeezJ8+PCsv/762X333VNfX58jjjiiMThYf/3184c//CHHHXdcrrvuupx55pmZOnVqY81f/epX87WvfS1JsuGGG2bkyJHZb7/9sv766+fnP/95Zs2alSuuuCL7779/7rrrrvTo0aOxnkXtGwty11135fnnn5/vPbjzzjvzwx/+MF26dMnZZ5+ds846KxdeeGGS5LTTTsszzzyTk08+Oeuss07efffdXHzxxTnllFPyxz/+MWVlZUu83ed19dVXz/eF/lvf+laqq6tzzjnnZPXVV095eXluu+22xd52dffdd2eHHXZIdXV1k+EzZszI0KFDc/TRRzcOm3dfufLKK3PhhRfmoIMOyumnn57XX389l156aT744INm3347Y8aM9O7dOxdffHHjsIb3ZW6vv/563nnnnZx88snp2rVrLrnkkhx00EG59957s/rqq+ett97K17/+9fTt2zdnnHFGqqqqcv311+fQQw/NNddck6233rrJNtthhx2SpMn6X3zxxRk0aFAuuuiivP3227nooosyfvz4XHTRRUmSYcOG5X//939zyimnpF+/fqmpqclll12WE088MQ8//HDjsZXMaYH0+OOPNx4v06ZNyyOPPLLAVmYrr7xyHnzwwey1116Nw+69996Ul5c3CWmXdPkdOnTIjjvumHvuuScHHnjgIrf/3Mdv0vS9njBhQr761a+mQ4cOOemkk9K9e/fccccdOe644/KLX/yiSb1LY0k/HxanOecAAIonfAKgxZ177rmZPHlyfvOb36SysulHzciRI3Prrbfm5JNPbuwA9/Of/3zKyspy5ZVX5oADDmhsKTV9+vTccccdWXvttZMk66+/fvbZZ5/ceeed2X///fPWW29ln332yQ9+8IPG+Q8ePDjbbLNNnnzyySbh05e+9KXG6bbffvuMGzcul19+efbff/907Ngx5513Xg444IBcddVVee6557LKKqss8Mvu3Ou40UYb5dVXX20cVl5enlmzZi30NaVSKcOGDcv222+fYcOGNQ7v27dvDjvssDzyyCPZYYcd8qtf/SrdunXLNddckw4dOiRJVl999Zxyyin55z//2WSe7777bv7+97/n7rvvzkYbbZQkS7RdXnrppXTr1i0nn3zyQutdnJtvvjmbbLJJ/vu//3u+cZtuumn69OmTJPMFIyNGjMhnP/vZ/PKXv2z80v25z30uDz74YJ588snsvvvuefTRR/Piiy/msssuy84775wk2XbbbfPee+/liSeeaBJmJUnv3r2zxRZbND4/88wz06lTp1x77bVZZZVVkiTbbbdddt5551x99dX53ve+1zjtovaNeUOhadOmZdiwYdl8882bvPelUimnnXZaYz9Jzz33XGOH3bW1tZk2bVrOOOOM7LbbbkmSrbfeOlOnTs15552XCRMmNPmi3xwffPBBfv3rXzepZ+LEiXnvvffywx/+MLvuumvjtH/7298WOa+pU6fm5Zdfzle+8pX5xk2fPj1rrrlmk208t48//jiXX355vvGNb+SMM85IMue47tatW84444wcfvjhjfvnkpg+fXq6dOnSZHkN7+O8y/3Vr36VrbbaKkkycODA7Lzzzrn++utz6qmnZvjw4amurs7111/f+Poddtghe+yxR37xi180CePWWWedBa7fqquuml/96leprKzMF7/4xZSXl+dnP/tZTjjhhGywwQYZN25cTjrppCYtcTp06JATTjghb7zxRpN5fu
ELX8gDDzzQGD49+OCD6dmzZ5Mwae5p//a3v6W2tjbV1dWZOXNmHnjggQwZMqRJ657mLH/AgAG59957M3Xq1AVuzwZ
"text/plain": [
"<Figure size 1200x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Feature subset used for clustering. The 'price' column of df_encoded is already\n",
"# standardized, so it is used as is: re-fitting the shared `scaler` on it would leave\n",
"# the values unchanged while overwriting the parameters fitted on the raw prices.\n",
"features = df_encoded[['price']]\n",
"scaled_features = features.to_numpy()\n",
"\n",
"# Ward linkage on the standardized price\n",
"linkage_matrix = linkage(scaled_features, method='ward')\n",
"\n",
"# Dendrogram; leaf labels come from the index of the data that was actually clustered\n",
"plt.figure(figsize=(12, 8))\n",
"dendrogram(linkage_matrix, labels=features.index.to_numpy(), leaf_rotation=90, leaf_font_size=10)\n",
"plt.title('Иерархическая кластеризация (дендрограмма) по цене')\n",
"plt.xlabel('Индекс товара')\n",
"plt.ylabel('Евклидово расстояние')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Визуализация распределения кластеров**"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAACbQAAAuoCAYAAAAwk66tAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzde5RXZaH/8c/IHQEBEUGJULMBRAQRBANBMjNFraRfWSIaiBc85A2VIiXFCsFIEFS85V1OXtC8YmVapnjJu6JmYKDcFEVBZBDm94eLOY5cRIWZ3Tmv11qzFrO/ez/f57u/s9bjcr3Xs0vKy8vLAwAAAAAAAAAAANVsi+qeAAAAAAAAAAAAACSCNgAAAAAAAAAAAApC0AYAAAAAAAAAAEAhCNoAAAAAAAAAAAAoBEEbAAAAAAAAAAAAhSBoAwAAAAAAAAAAoBAEbQAAAAAAAAAAABSCoA0AAAAAAAAAAIBCELQBAAAAQBUrLy+v7in8r+J+AgAAAPzvIWgDAAAAYL0GDBiQ0tLSSj8dOnRInz598otf/CJLlizZ4PVz585NaWlpbrnlliqa8ebx8MMP54QTTkivXr2y22675Zvf/GbGjBmTt9566zOP9cQTT2TIkCGbYZbVZ9asWRk1alT23XffdOzYMX369MnJJ5+cmTNnVjqvb9++OeOMMzbpe7/yyis57LDDNumYAAAAAFSfmtU9AQAAAACKrX379jnrrLMqfl+5cmWef/75/OY3v8mLL76YG264ISUlJeu8tnnz5pk6dWpat25dVdPd5MaNG5fLLrss+++/f372s5+lcePGeemll3LppZdm+vTpufbaa9OyZcuNHu/3v/99Xn311c0446o1ffr0nHbaadl5551z3HHHpVWrVpk/f36uuuqq/L//9/9y0UUX5Wtf+9pme/977rknTz755GYbHwAAAICqJWgDAAAAYIMaNGiQTp06VTrWtWvXLFu2LBMmTMjTTz+91utr1K5de72v/Se48847c+mll2bEiBE58sgjK4537949vXv3zne+852ce+65ufDCC6tvktXo3//+d04//fT06tUrv/3tb1OjRo2K1/bbb78cdthhOf300/PnP/85tWvXrsaZAgAAAPCfwiNHAQAAAPhcOnTokCR54403knz0eNJTTz01w4YNS6dOnXLUUUet85Gj//rXv3LCCSekW7du6dq1a4455phKO5atWLEi5513Xnr37p0OHTrkoIMOyl133bXeeaxYsSJdunTJmDFjKh3/8MMP071794wePTpJ8txzz2XgwIHp0qVLOnfunCOPPDJPPfXUBj/jlClT8pWvfCUDBw5c67U2bdpk+PDh6dy5c8rLy5Mkixcvzi9+8Yvss88+6dChQ7p165ahQ4dm7ty5SZIzzjgjt956a15//fVK92VjPvPKlSszbty47L333unYsWMGDRqUadOmpbS0tGL8JHnooYfywx/+MF26dMmee+6ZU045JfPmzat4/ZZbbkn79u3z+9//Pl/72tfSrVu3XHfddSktLc2sWbMqvedtt92Wdu3aVbr+46655pqUlZVl5MiRlWK2JKlXr15OP/30HHrooet8NO2MGTNSWlqaGTNmVDo+YMCADBgwoOL3DX1vEydOrIgJS0tLM3HixCTJ6tWrM2XKlHzjG99Ihw4d8s1vfjPXXHPNWu/zyb/XJLnjjjty8MEHp2PHjunevXtOPfXULFiwYJ2fHwAAAIBNT9AGAAAAwOeyJn760pe+VHHs7rvvzpZbbpmLLroogwcPXuuaBQsW5Pvf/35mz56dUaNGZezYsXnzzTczcODAvPPOOykvL8/QoUNz44035qijjspFF12Uzp0756STTsq0adPWOY86derkm9/8Zu6+++6KsCz5KOx6++23c8ghh2Tp0qUZPHhwmjRpkokTJ2b8+PFZvnx5Bg0alPfee2+d4y5atCgzZ85Mnz591vtI1R/+8IcZNGhQSkpKUl
5enmOOOSYPPfRQTj311Fx++eU54YQT8vDDD1c8svX4449P7969s80222Tq1Knp06fPRn/mM888M1dddVUOP/zwTJo0Kc2aNcvPf/7zSvOZNm1afvzjH6dly5b5zW9+kxEjRuTJJ5/M97///bz11lsV561atSpXXHFFzj333IwYMSL9+vVLnTp1ctttt601Xo8ePdb7SNW//vWvad++fbbddtt1vt6jR4+cdNJJ2Wabbdb5+qf5tO/te9/7Xvr3758kmTp1ar73ve8lSUaNGpUJEybk4IMPzsUXX5z9998/v/zlLzNp0qRK43/y7/WJJ57Iaaedlv32269iZ75HHnkkp5xyyueaPwAAAACfnUeOAgAAALBB5eXl+fDDDyt+X7JkSR599NGK8GrNTm1JUqtWrfziF7+oeLzkx3cOS5Lf/e53KSsry5VXXlkRObVt2zaHHXZYnn766dSsWTN//etfM378+BxwwAFJkl69emX58uUZN25c+vXrl5o11/5fWoccckhuvvnmPPHEE9ljjz2SfPS40B133DG77rprnnrqqbz99ts54ogjsvvuuydJdtxxx0ydOjXLli1Lw4YN1xpzza5krVq12qj7tHDhwopdydbMYc8998y///3vTJ06NUnSunXrNG3atNKjWB966KFP/cxvvPFGbr311px++ukVO4n16tUrb775Zv72t78l+WhXsnHjxqVnz545//zzK+a1++6754ADDsjll1+e0047reL4sccemz59+lT8/o1vfCO33357fvKTn6SkpCTz58/PI488krFjx673M8+fPz/t2rXbqPvzefzzn//c4PfWokWLtGjRIkkq7uesWbPy3//93zn55JMzZMiQJEnPnj1TUlKSSy65JD/84Q/TpEmTJGv/vU6ZMiV169bNkCFDKo41btw4zz77bMrLy9cbNgIAAACw6dihDQAAAIANeuyxx7LLLrtU/Oy11145+eST06FDh5x//vmVIp8dd9yxIgRalyeeeCKdOnWqtGNXixYtcv/996d37955+OGHU1JSkt69e+fDDz+s+Onbt28WLVqUV155ZZ3jduvWLdttt13uvPPOJB89wvOPf/xjDjnkkCTJzjvvnKZNm+bYY4/NmWeemfvuuy/NmjXL8OHDK4KoT1oTzq1evXqj7tO2226bq6++Ol26dMncuXPz0EMP5Zprrsk//vGPlJWVrfe6jfnMM2bMSHl5efbff/9K1/br16/i37NmzcqiRYsqHUs+iug6d+6cRx99tNLxT4Zo/fv3z+uvv57HH388yUe7s2255Zb5xje+sd6516hRI6tWrdrwjfkCPs/39sgjj6S8vDx9+/Zd636uWLEiTzzxRMW5n/x77dq1a5YvX55+/frl/PPPz+OPP56ePXvmhBNOELMBAAAAVBE7tAEAAACwQbvsskt+8YtfJElKSkpSp06dtGzZMg0aNFjr3C233HKDY73zzjsb3PFszWNH1+zG9UkLFy5c545gJSUlOeigg/L73/8+I0eOzP3335/3338/Bx10UMW8rrvuulx00UW5++67M3Xq1NStWzeHHHJIRo4cuc4Ir2XLlikpKcnrr7++3vkuWbIkNWvWrPjct99+e37zm99k3rx5ady4cdq1a5e6det+6j35tM+8ePHiJMnWW29d6bWP//7OO+8kSZo1a7bWGM2aNcsLL7xQ6Vj9+vUr/d69e/e0atUq06ZNS9euXTNt2rQccMABqVOnznrnvt122+WNN95Y7+srV67MkiVL1jmnjfF5vrc19+HAAw9c55gLFiyoNP7Hde7cOVOmTMnvfve7XHnllZkyZUqaNWuWY489NgMGDPhcnwEAAACAz0bQBgAAAMAGbbnlltl11103yVgNGzasiLM+7uGHH06rVq3SsGHD1K9fP1dfffU6r//yl7+83rEPOeSQXHLJJZkxY0buuuuudO3aNdtvv33F6zvuuGPGjh2bVatW5Zlnnsltt92WG264Ia1bt87gwY
PXGq9JkybZZZdd8te//jXDhw9f5w5dF154YW688cbcf//9mT17dk4//fQMGDAggwYNyrbbbpskOe+88yrtCraue/J
"text/plain": [
"<Figure size 2500x3000 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.cluster import KMeans\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Закодирование категориальных переменных\n",
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
"\n",
"# Выбор подмножества данных для кластеризации\n",
"features = df_encoded[['price']]\n",
"\n",
"# Масштабирование данных\n",
"scaler = StandardScaler()\n",
"scaled_features = scaler.fit_transform(features)\n",
"\n",
"# Кластеризация данных\n",
"kmeans = KMeans(n_clusters=3)\n",
"df_encoded['Cluster'] = kmeans.fit_predict(scaled_features)\n",
"\n",
"# Визуализация кластеров\n",
"plt.figure(figsize=(25, 30))\n",
"\n",
"# Парный график 1: Price vs Category\n",
"category_columns = [col for col in df_encoded.columns if col.startswith('category_')]\n",
"if category_columns:\n",
" plt.subplot(4, 1, 1)\n",
" sns.scatterplot(x=df_encoded['price'], y=df_encoded[category_columns[0]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
" plt.title('Price vs Category Clusters')\n",
" plt.xlabel('Price')\n",
" plt.ylabel(f'Category ({category_columns[0]})')\n",
"else:\n",
" plt.subplot(4, 1, 1)\n",
" plt.text(0.5, 0.5, 'No category columns found', ha='center', va='center', fontsize=12)\n",
" plt.title('Price vs Category Clusters')\n",
"\n",
"# Парный график 2: Price vs Sub-Category\n",
"sub_category_columns = [col for col in df_encoded.columns if col.startswith('sub_category_')]\n",
"if sub_category_columns:\n",
" plt.subplot(4, 1, 2)\n",
" sns.scatterplot(x=df_encoded['price'], y=df_encoded[sub_category_columns[0]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
" plt.title('Price vs Sub-Category Clusters')\n",
" plt.xlabel('Price')\n",
" plt.ylabel(f'Sub-Category ({sub_category_columns[0]})')\n",
"else:\n",
" plt.subplot(4, 1, 2)\n",
" plt.text(0.5, 0.5, 'No sub-category columns found', ha='center', va='center', fontsize=12)\n",
" plt.title('Price vs Sub-Category Clusters')\n",
"\n",
"# Парный график 3: Price vs Category (другая категория)\n",
"if len(category_columns) > 1:\n",
" plt.subplot(4, 1, 3)\n",
" sns.scatterplot(x=df_encoded['price'], y=df_encoded[category_columns[1]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
" plt.title('Price vs Category Clusters')\n",
" plt.xlabel('Price')\n",
" plt.ylabel(f'Category ({category_columns[1]})')\n",
"else:\n",
" plt.subplot(4, 1, 3)\n",
" plt.text(0.5, 0.5, 'Not enough category columns found', ha='center', va='center', fontsize=12)\n",
" plt.title('Price vs Category Clusters')\n",
"\n",
"# Парный график 4: Price vs Sub-Category (другая подкатегория)\n",
"if len(sub_category_columns) > 1:\n",
" plt.subplot(4, 1, 4)\n",
" sns.scatterplot(x=df_encoded['price'], y=df_encoded[sub_category_columns[1]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
" plt.title('Price vs Sub-Category Clusters')\n",
" plt.xlabel('Price')\n",
" plt.ylabel(f'Sub-Category ({sub_category_columns[1]})')\n",
"else:\n",
" plt.subplot(4, 1, 4)\n",
" plt.text(0.5, 0.5, 'Not enough sub-category columns found', ha='center', va='center', fontsize=12)\n",
" plt.title('Price vs Sub-Category Clusters')\n",
"\n",
"# Настройка графиков\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## KMeans (неиерархическая кластеризация) для сравнения"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Центры кластеров:\n",
" [[ 194.76055021]\n",
" [ 696.35470625]\n",
" [1847.9773913 ]\n",
" [5430.2962963 ]]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABi8AAASgCAYAAACAO9vxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3yN9///8efJTiSxJXaMir1KbGIr2lpttShVm9qjOlTtqhmbRtUqRa1SVK1+tEI60KKlRikROyJTcn5/+OV8RWIdOec65XG/3T63T3Jd73Ndz+udS3quvM77/TaZzWazAAAAAAAAAAAAHIST0QEAAAAAAAAAAADuRvECAAAAAAAAAAA4FIoXAAAAAAAAAADAoVC8AAAAAAAAAAAADoXiBQAAAAAAAAAAcCgULwAAAAAAAAAAgEOheAEAAAAAAAAAABwKxQsAAAAAAAAAAOBQKF4AgI2YzWajI8CBcX84Nn4+AADgWcR7IPxXcK8CzwaKF8BTpEOHDurQoUOa7dHR0Xr11VdVunRpbd++3dI2MDBQbdu2ve/xBgwYoMDAQL377rs2y2wr8fHxWrRokVq3bq3nn39eQUFBatu2rdatW5fqTc6MGTMUGBiYoedOSEjQuHHjtHHjxgw53v1+rvZw7tw5BQYGpvpf8eLFVaFCBbVq1UqrV69+6DFs0cf29qj306OaPXu2QkNDbZDUOD169NCqVask/d/PfNGiRem2fffdd1WvXr0nPue7776b5t4sX768XnzxRc2cOVNxcXGPfcyIiAh169ZN//7772O9bvr06Ro5cuRjnw8AgGcdzzD/h2eYjMEzjHT79m0tWrRILVu2VPny5VWhQgW1bNlSCxcuVEJCwmMfLzAwUDNmzLA6T3JyslatWqV27dqpSpUqqlixolq2bKklS5ZYlWfVqlX65JNPrM4D4L/DxegAAGwrOjpaXbp00bFjxzRr1izVqVPHss/JyUm//fabIiIi5O/vn+p1MTEx2rlzp73jZojLly+rS5cuunDhgjp06KCyZcsqOTlZO3fu1Lvvvqvw8HCNHj1aJpPJJuePjIzUF198ofHjx2fI8T766KMMOc6T6Nmzp4KDgyXd+YTLrVu3tGrVKr3//vu6ffv2Ax8gX3nlFdWqVctOSTOeLe6n6dOnq0+fPjZMbV9ff/21Ll68qNatW6faPnXqVNWtW1cFCxa02blz5sypmTNnSrrzUHTz5k2Fh4dr3rx5+t///qcvvvhC7u7uj3y8H3/8Ubt3737sHN26dVPjxo3VuHFjVatW7bFfDwAA/g/PMDzDZIRn+Rnmww8/1LZt29StWzeVLl1aycnJCg8P17Rp0/Tzzz9r1qxZdssSGxurHj166ODBg3r99dfVpUsXubq6at++fZo4caL27NmjWbNmyc3N7ZGPOWfOHAUFBdkwNQBHQfECeIqlvOk/evSo5syZoxo1aqTaX7JkSZ04cUJbtmxRp06dUu3buXOnPD095evra8fEGWPYsGGKiIjQypUrFRAQYNkeHBysPHnyaMqUKapbt67q169vXMjHULRoUaMjqECBAipfvnyqbdWrV9exY8e0aNGiB77x9/f3T/Ng+V/ytN1PGS0uLk6TJk3SRx99JCen1AM63dzc9N5772np0qU2e9B2c3NLc2/WqVNH5cqVU+/evbVw4UL17NnTJue+m6enpzp27Kjx48drw4YNNj8fAABPK55hno73nDzDGOf8+fNau3atRo0apVdffdWyvVatWsqWLZvGjRunQ4cOqWzZsnbJM378eP3yyy9asmRJqp9HzZo1Vbx4cQ0aNEgrVqzQm2++aZc8AP5bmDYKeErdunVLXbt21Z9//qn58+enedMvSV5eXqpTp462bNmSZt/mzZvVuHFjubikrnEmJydr/vz5atiwoUqXLq3GjRtryZIlqdokJSVp/vz5at68ucqWLavy5curbdu22rdvn6XNjBkz1LBhQ+3atUsvvv
ii5Vjr1q1LdawvvvhCTZo0UZkyZVSrVi2NHDlS0dHR973uo0eP6n//+5/efvvtVG/6U3Tq1Ent2rWTl5dXuq+vV69emiHmX3/9tQIDA3Xu3DlJd/5YO3LkSNWuXVulS5dWkyZNLFMAnTt3zvJAMXz48FRT44SHh6t9+/YqV66cgoKCNGzYMF29ejXVeUqWLKlVq1apRo0aCgoK0okTJ9IMuQ4MDNSyZcv0/vvvKygoSBUqVFC/fv10+fLlVLlDQ0NVv359lS1bVm3bttWOHTsUGBiosLAwS9YnGf7r5OSkEiVK6Pz586mO9/nnn6tJkyYqV66c1qxZk+6Q63Xr1qlly5YqV66cgoODNXny5FTDhf/66y91795dFStWVMWKFdW7d2+dPXv2vlk2btyowMBA/fXXX6m2b9++XYGBgTpy5Igk+9xPBw4c0Ntvv63KlSurdOnSqlevnmbMmKHk5GRJsvTFzJkzU/XLo1zz33//ra5du6pixYqqXr26pk6dquHDh6e6P+Lj4zVr1izLdTZq1Ejz58+3nF+6M4x/8ODB6tu3r8qXL6+33npLrVu3TvcBrlOnTnrrrbfu20dr1qxRfHy86tatm2ZfyqcEFy9efN/Xp9i7d6/eeOMNPf/886pSpYoGDRqkCxcuPPR199OgQQOVL19eK1assGx72O+mr7/+WsOHD5ck1a9f3/K7IC4uTpMnT1ajRo1UunRpVaxYUW+99ZaOHj2a6pzNmzfX8ePHtWvXLqtzAwDwLOMZhmcYnmGe/Bnm8uXLMpvNqd7/p3jxxRc1cOBAS4Hv3vskRXr3VHR0tAYPHqwKFSqoWrVqGjNmjGJjY++bQ5KuXr2qNWvWqHXr1mkKSdKd98+dO3eWn5+fZduxY8fUp08fVa1aVaVKlVKtWrU0ZswYy5Sw9erV07///qu1a9emyn7+/HkNHDhQQUFBKleunDp27GjpwxSRkZEaMGCAgoKCVLlyZY0YMUJTp05Ndc8nJSVp2bJlevHFF1W2bFkFBwdr0qRJio+Pt7R599131bFjR3300UeqWLGimjZtqr59+6p27dpp+v39999X48aNH9hPAO6PkRfAUygmJkbdunXTkSNHFBoaqkqVKt23bdOmTdW/f/9Uw66jo6O1Z88eff7559qzZ0+q9iNHjtTXX3+t7t27q0KFCjpw4IDGjRunqKgo9e7dW5I0adIkffnllxo0aJACAwN18eJFzZo1S/369dOuXbvk6ekpSbp06ZJGjRqlnj17Km/evAoNDdWwYcNUpkwZFSlSRN98840+/fRTDRs2TIGBgTp58qQ++eQTxcbG3nd+yx9++EGS7jufvru7u0aMGPF4HXqPcePG6X//+5+GDRumHDlyaM+ePZo4caKyZMlimWu/T58+6tmzpxo1aiTpzh+033rrLVWtWlXTpk3TjRs3NH36dL355ptavXq1PDw8JN15o7Rw4UKNHTtW165dU5EiRdLNMHXqVDVs2FBTpkzR2bNnNX78eDk7O2vKlCmS7vxhfNasWXr77bdVtWpV/fDDD+rfv3+qY+TKlUsrV658ok8UnTp1SgUKFEi1bcaMGXr//ffl7e2tcuXKWdZBSLFs2TKNGjVKr7zyigYOHKizZ89q4sSJunHjhkaNGqVTp06pbdu2Kly4sD755BPdvn1bc+bM0euvv67169cre/bsaXI0aNBAXl5e2rRpk4oVK2bZ/s033+i5555TyZIl7XI/HTt2TJ06dVKTJk00depUmc1mbdy4UTNnzlThwoXVrFkzrVy5Uq+99pratGmjV155xdKPD7vmq1evqn379sqePbvGjx+vpKQkTZ8+XefPn7c8CJjNZvXo0UO//fab+vTpo+LFiyssLEzTpk3T2bNnNXr0aEvWb7/9Vi+99JLmzJmj5ORk/fPPPxo5cqTOnDljmebpwoULCgsL08SJE9O9fknasGGDgoOD0x3m3bp1a23ZssUyfdS990qKdevWadiwYWrevLm6d+
+ua9euKSQkRK+99prWrl2b7s/8UdSoUUOzZs3Sv//+q7x58z70d1NwcLB69uypOXPmpCouDR06VOHh4Ro4cKAKFCi
"text/plain": [
"<Figure size 1600x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.cluster import KMeans\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Загрузка данных\n",
"df = pd.read_csv(\"..//static//csv//jio_mart_items.csv\")\n",
"df = df.iloc[:15000]\n",
"\n",
"# Удаление несущественных столбцов\n",
"columns_to_drop = ['href', 'items']\n",
"df_cleaned = df.drop(columns=columns_to_drop)\n",
"\n",
"# Закодирование категориальных переменных\n",
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
"\n",
"# Выбор подмножества данных для кластеризации\n",
"features_used = ['price']\n",
"data_to_scale = df_encoded[features_used]\n",
"\n",
"# Масштабирование данных\n",
"scaler = StandardScaler()\n",
"data_scaled = scaler.fit_transform(data_to_scale)\n",
"\n",
"# Кластеризация данных\n",
"random_state = 42\n",
"kmeans = KMeans(n_clusters=4, random_state=random_state)\n",
"labels = kmeans.fit_predict(data_scaled)\n",
"centers = kmeans.cluster_centers_\n",
"\n",
"# Отображение центроидов\n",
"centers_original = scaler.inverse_transform(centers) # Обратная стандартизация\n",
"print(\"Центры кластеров:\\n\", centers_original)\n",
"\n",
"# Визуализация результатов кластеризации KMeans\n",
"plt.figure(figsize=(16, 12))\n",
"\n",
"# Парный график 1: Price vs Category\n",
"plt.subplot(2, 2, 1)\n",
"category_columns = [col for col in df_encoded.columns if col.startswith('category_')]\n",
"if category_columns:\n",
" sns.scatterplot(x=df_cleaned['price'], y=df_encoded[category_columns[0]], hue=labels, palette='Set1', alpha=0.6)\n",
" plt.title('KMeans Clustering: Price vs Category')\n",
" plt.xlabel('Price')\n",
" plt.ylabel(f'Category ({category_columns[0]})')\n",
"else:\n",
" plt.title('KMeans Clustering: Price vs Category (No Data)')\n",
" plt.xlabel('Price')\n",
" plt.ylabel('Category')\n",
"\n",
"# Парный график 2: Price vs Sub-Category\n",
"plt.subplot(2, 2, 2)\n",
"sub_category_columns = [col for col in df_encoded.columns if col.startswith('sub_category_')]\n",
"if sub_category_columns:\n",
" sns.scatterplot(x=df_cleaned['price'], y=df_encoded[sub_category_columns[0]], hue=labels, palette='Set1', alpha=0.6)\n",
" plt.title('KMeans Clustering: Price vs Sub-Category')\n",
" plt.xlabel('Price')\n",
" plt.ylabel(f'Sub-Category ({sub_category_columns[0]})')\n",
"else:\n",
" plt.title('KMeans Clustering: Price vs Sub-Category (No Data)')\n",
" plt.xlabel('Price')\n",
" plt.ylabel('Sub-Category')\n",
"\n",
"# Парный график 3: Price vs Category (другая категория)\n",
"plt.subplot(2, 2, 3)\n",
"if len(category_columns) > 1:\n",
" sns.scatterplot(x=df_cleaned['price'], y=df_encoded[category_columns[1]], hue=labels, palette='Set1', alpha=0.6)\n",
" plt.title('KMeans Clustering: Price vs Category')\n",
" plt.xlabel('Price')\n",
" plt.ylabel(f'Category ({category_columns[1]})')\n",
"else:\n",
" plt.title('KMeans Clustering: Price vs Category (No Data)')\n",
" plt.xlabel('Price')\n",
" plt.ylabel('Category')\n",
"\n",
"# Парный график 4: Price vs Sub-Category (другая подкатегория)\n",
"plt.subplot(2, 2, 4)\n",
"if len(sub_category_columns) > 1:\n",
" sns.scatterplot(x=df_cleaned['price'], y=df_encoded[sub_category_columns[1]], hue=labels, palette='Set1', alpha=0.6)\n",
" plt.title('KMeans Clustering: Price vs Sub-Category')\n",
" plt.xlabel('Price')\n",
" plt.ylabel(f'Sub-Category ({sub_category_columns[1]})')\n",
"else:\n",
" plt.title('KMeans Clustering: Price vs Sub-Category (No Data)')\n",
" plt.xlabel('Price')\n",
" plt.ylabel('Sub-Category')\n",
"\n",
"# Настройка графиков\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### PCA для визуализации сокращенной размерности"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAAJICAYAAADPWa1BAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3yT5frH8W+SJm26Szd7I3vKUEFABbfgOE4EjgMExwFRRAVRD6gIiCKgKDgAfy4UcYHgFhFFQRyg7F1K927TJL8/OERKBw30aUL7eb9evmyfkefK1bTkyvXc921yu91uAQAAAAAAAAAA+BGzrwMAAAAAAAAAAAA4Hg0MAAAAAAAAAADgd2hgAAAAAAAAAAAAv0MDAwAAAAAAAAAA+B0aGAAAAAAAAAAAwO/QwAAAAAAAAAAAAH6HBgYAAAAAAAAAAPA7NDAAAAAAAAAAAIDfoYEBALWI2+32dQgoR23+2dTm5w4AAICSeG9YO/BzBlBZNDAAaMiQIWrVqlWJ/9q1a6e+ffvq0UcfVWZmZqlzdu7cqcmTJ+v8889Xhw4d1LdvX40dO1Zbtmwp9zrPPPOMWrVqpccff9zIp1Ou2bNnq1WrVj65dlnee+89tWrVSvv27TP8vKKiIk2dOlUffviht2F65brrrlOrVq20cuVKQ6/jbz/LU5GVlaX7779f69ev92wbMmSIhgwZUm0xVPb3uX///nrggQeq9Npbt27V9ddfXyWPtW/fPrVq1UrvvfdelTweAADwH9QsvlGTapZWrVpp9uzZpbb//fff6tWrl84991zt2rXLc2yrVq00c+bMMh/L5XKpd+/ep+17z0OHDmnatGm68MIL1bFjR51zzjkaOXJkiZpEMqYuSUpK0u233679+/dXyeOV93MFUHPQwAAgSWrTpo3eeustz3+vvPKKhg0bpqVLl2rEiBEl7o747LPPNHjwYP3xxx+644479NJLL2nMmDHatWuX/vWvf2nNmjWlHt/lcmnZsmVq2bKlPvjgA+Xn51fn06v1kpOT9dprr6m4uNiwa+zYsUMbNmxQy5Yt9eabbxp2nZpm8+bN+uCDD+RyuTzbHnnkET3yyCPVcv2T+X2uSitWrNCGDRuq5LHi4uL01ltvqW/fvlXyeAAAwL9Qs9Rs1VGzHG/r1q0aNmyY7Ha7Fi9erMaNG3v2mc1mrVixoszzfvrpJyUnJ1dTlFXr559/1hVXXKEvv/xSN998s1544QU99NBDKigo0JAhQ7Rs2TJDr//999/r66+/rrLHe+utt3TNNddU2eMB8D8Bvg4AgH8IDQ1Vp06dSmw788wzlZubq+eee06//vqrOnXqpD179mj8+PHq3bu3Zs2aJYvF4jl+wIABuv766zV+/Hh98cUXstlsnn3fffedkpKSNHPmTN1000366KOPeJNRw7z33nuqV6+eRowYoXHjxmn37t1q1KiRr8M6LTVv3rxarnOyv8/+ymazlfo7BgAAag5qFlSl7du3a+jQoQoJCdFrr72munXrltjfpUsXrV+/Xn/++afatGlTYt/HH3+s1q1ba/PmzdUZ8inLyMjQf/7zHzVu3FivvPKK7Ha7Z9/AgQN1++23a9KkSTrnnHMUExPjw0grj/f/QM3HCAwAFWrXrp0k6cCBA5KkRYsWqaioSA8//HCJQkCS7Ha7xo8fr6uuuqrUEO6lS5eqZcuW6tq1q3r06KG33nrrhNfu37+/pk6dqqFDh6pDhw566KGHJB150zVp0iSdddZZat++vf71r39p7dq1Jc4tLCzUE088obPPPludO3fWhAkTVFhYWOKYsobDrlu3Tq1atdK6des823bs2KE777xT3bt315lnnqkRI0Zo+/btJa41bdo0nXvuuWrXrp0uu+wyffLJJyUe1+Vyae7cuerbt686duyoUaNGlTnM/XiVPW/16tW64YYb1LlzZ7Vr104XXnihlixZIunItDrnnXeeJGnChAnq37+/57
x33nlHV155pTp16qQOHTroiiuu0KefflrisVu1anXCaYOcTqeWLVumfv366fzzz1dwcHCZP2OHw6Hp06erT58+6tChg2655RYtW7as1PDy999/XxdffLHat2+vyy+/XGvXrlWbNm0qHJ79ySef6Morr1Tnzp119tlna9KkSSVyNXv2bF144YVatWqVLr30UrVv315XXHGFNmzYoI0bN+qaa65Rhw4ddOmll5Z6Pf39998aMWKEunTpoi5dumj06NHau3evZ//R182bb76pfv36qUuXLp67+irK8bp163TzzTdLkm6++WbP6/HY1+a///1vXXnllaWe76hRo3T55Zd7vl+/fr1uuukmdezYUd27d9f48eOVlpZWbr6kk/99PvY5H/u7cnzskvT7779r6NCh6tq1qzp37qxhw4Zp48aNko78TJ5//nlJJYd+u1wuzZ8/XxdccIHatWungQMHatGiRaWuM27cON19993q1KmThg8fXmoKqffee09t2rTRr7/+qmuvvVbt27dXv379tGDBghKPlZycrDFjxnh+xydNmqRnnnmmxO8KAADwX9Qs1CyVqVmOtX37dt18880KCwvT4sWLSzUvpCPNsZiYmFKjMIqLi/XZZ5/pkksuKXVOZX7uaWlpevTRR9WvXz+1a9dO3bt31+jRo0vUQ0OGDNFDDz2k+fPnq2/fvmrfvr2uu+46bdq0yXNMQUGBJk+erD59+njyefz73OMtW7ZMycnJevDBB0s0L6QjI07GjRunG2+8UTk5OaXOLW+61gceeKDEz2vPnj0aOXKkevTooY4dO+raa6/1jLh47733NGHCBEnSeeedV+Jn9s477+iSSy7xTA03e/ZsOZ3OEtcZOnSoHnnkEXXp0kUXX3yxnE5niTri6O/G2rVr9e9//1sdO3bU2WefraeffrrEY+Xk5GjSpEnq1auXOnfurDFjxujVV1/1q+nbAPyDBgaACu3cuVOS1KBBA0nSt99+qzZt2ig+Pr7M43v16qUxY8YoNjbWsy0jI0NffPGFBg0aJEkaPHiwfvvtN/3xxx8nvP6SJUvUvn17zZ07V1dffbUKCws1dOhQff755xozZoyef/55JSQk6NZbby3xxvC+++7T22+/rREjRmjWrFnKzMzUq6++6vXzP3TokK699lrt2rVLkydP1tNPP62UlBQNHTpUGRkZcrvdGj16tN58800NHz5c8+bN87wBOnbo7dNPP605c+bo6quv1vPPP6/IyEjNmDHjhNevzHlfffWVRo8erbZt22ru3LmaPXu2GjRooMcee0y//vqr4uLiPB8S33HHHZ6vlyxZokmTJun888/Xiy++qOnTp8tms2ncuHFKSkryPP5bb72lUaNGVRjnN998o8OHD2vQoEEKCgrSRRddpPfff19FRUUljps0aZJee+013XTTTZozZ45iYmI0ceLEEscsW7ZMDzzwgLp06aK5c+dq4MCBGjVqVIk3nMebO3euxo4dq06dOum5557T6NGjtXLlSg0ZMkQFBQWe45KSkvTkk09q5MiRevbZZ5WVlaW7775bY8eO1TXXXKM5c+bI7XZrzJgxnvN27typ6667TqmpqXrqqac0ZcoU7d27V9dff71SU1NLxPH8889r/PjxmjRpkjp37nzCHLdt21aTJk3y5KasaaMuv/xy/fHHH9q9e7dnW1ZWlr755htdccUVko4MYR82bJiCgoI0a9YsPfjgg/rxxx918803l3j+xzuZ32dv5OTk6NZbb1VUVJRmz56tZ555Rvn5+brllluUnZ2ta665RldffbWkkkO/J0+erOeee06XX365XnjhBV144YWaOnWq5syZU+LxP/30U4WEhGjevHm69dZby4zB5XLpP//5jy6++GLNnz9fXbp00bRp0/Ttt99KOjLX8tChQ/XLL7/owQcf1BNPPKEtW7Zo4cKFJ/WcAQBA9aNmoWapTM1y1I4dOzR06FCFhoZq8eLF5b5OLBaLBg4cWKqBsX
btWhUWFpa62aUyP3e3260RI0ZozZo1GjdunBYsWKA777xTa9euLVULrFy5Up9//rkefvhhzZw5UykpKbrrrrs8ddH
"text/plain": [
"<Figure size 1600x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Применение PCA ТОЛЬКО к числовым данным\n",
"pca = PCA(n_components=1)\n",
"reduced_data = pca.fit_transform(data_scaled)\n",
"\n",
"# Визуализация сокращенных данных\n",
"plt.figure(figsize=(16, 6))\n",
"\n",
"# График 1: PCA reduced data: Agglomerative Clustering\n",
"plt.subplot(1, 2, 1)\n",
"sns.scatterplot(x=range(len(reduced_data)), y=reduced_data[:, 0], hue=result, palette='Set1', alpha=0.6)\n",
"plt.title('PCA reduced data: Agglomerative Clustering')\n",
"plt.xlabel('Sample Index')\n",
"plt.ylabel('Principal Component 1')\n",
"\n",
"# График 2: PCA reduced data: KMeans Clustering\n",
"plt.subplot(1, 2, 2)\n",
"sns.scatterplot(x=range(len(reduced_data)), y=reduced_data[:, 0], hue=labels, palette='Set1', alpha=0.6)\n",
"plt.title('PCA reduced data: KMeans Clustering')\n",
"plt.xlabel('Sample Index')\n",
"plt.ylabel('Principal Component 1')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Анализ инерции для метода локтя (метод оценки суммы квадратов расстояний)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2cAAAImCAYAAADXOPIYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB86UlEQVR4nO3dd1zVZf/H8fdhgwwFZako7gW4cJtmZkPrTm3dpZWpqVn+SrvtNm1ndSe5szQtG5YjzYbtnZo7J5iiYg6GgoDIhvP7Azl5ZIpwzgFez8eDB/Jd58PhKnl7fb7X12A0Go0CAAAAAFiVnbULAAAAAAAQzgAAAADAJhDOAAAAAMAGEM4AAAAAwAYQzgAAAADABhDOAAAAAMAGEM4AAAAAwAYQzgAAAADABhDOAAAAAMAGEM4AAAAAwAYQzgDAAkaOHKnWrVvr7rvvLvGYxx9/XK1bt9Z///tfC1YGoKJOnjyp1q1ba926ddYuBUANQTgDAAuxs7PT7t27FRcXV2Rfenq6fv75ZytUBQAAbAXhDAAspF27dnJ2dtY333xTZN/PP/8sV1dX+fn5WaEyAABgCwhnAGAhbm5u6tevX7Hh7KuvvtINN9wgBweHIvt++OEHDRs2TCEhIerdu7deeuklpaenS5IGDBig1q1bF/tx8uRJSdKmTZt0zz33qEuXLurevbumTJmi2NhYs9eYMmVKsdcoq12rsF2zuI9L7du3T6NHj1b37t3VuXNnjR8/XocPHzbt37p1q1q3bq2tW7dKkg4dOqSBAwfq7rvv1oIFC0p8jQULFkiS1qxZo5tuukkdOnQw219Wi+jq1auLve6l5xW2rpV1XEVrKO97U9rrl7S/8Ofw3//+VwMGDDB73ZUrV5q9h5e+zs6dO82O/fDDD9W6dWuza2RmZur111/XoEGD1KFDB3Xu3FmjRo1SVFSU2bkl1TVy5EizYwrrKM7l46PQyJEjza6TlZWlN954QzfeeKNCQkI0aNAgLVmyRPn5+WbnXF7L1q1by3VuWYxGo6ZNm6bQ0FBt3Lix3OcBQKGivwUAAKrMzTffrMcee0xxcXHy9/eXJKWlpem3337Tu+++q99++83s+C+++EJPPPGEbrnlFj322GM6deqU5syZo+joaL377rtauHChsrOzdebMGT3yyCOaMGGC+vfvL0ny9fXV+vXr9eSTT2rIkCEaN26czp07p/nz5+uuu+7Sp59+Kh8fH0kFv9TeddddGjZsmCSZrlce7dq107PPPmv6es2aNfrkk09MX2/ZskVjxoxR9+7d9fLLLysrK0uLFy/W3XffrdWrV6t58+ZFrjlr1ix16NBBEyZMkJeXl/r27StJev755yXJ9Hr+/v7avn27ZsyYodtvv10zZsxQnTp1JKlc9WdmZiokJEQzZswwbSvpvEvf28uPq2gNV/LePPPMM2rfvn2xr79q1SpJ0oEDB/TCCy8UOfZyKSkpmjt3brH76tSpo59++kldunQxbfvqq69kZ2f+77lTp07Vjh07NHnyZAUFBen48eOaN2+epkyZog0bNshgMJiOvf3223XHHXeYvi78OVYmo9Go8ePHa/fu3XrkkUfUpk0bbd26VXPnztWJEyf04osvmo69fMw2b9683OeW5qWXXtKXX36pN954Q3369Kn07xFAzUc4AwAL6t+/v1xdXfXNN9/ogQcekCR9//338vHxMftlWCr4ZTMiIkJ9+/ZVRESEaXvTpk31wAMP6NdffzWFhcJZsqCgIHXs2FGSlJ+fr4iICPXp00evv/666fzOnTvr5ptv1rJlyzR16lRJUkZGhpo2bWo6t/B65eHu7m46T5J+//13s/2vv/66mjRpoiVLlsje3l6S1KdPH11//fWaP3++5s2bZ3b88ePHtXHjRn3++edq2bKlJJmCrLu7uySZvd6GDRskSU899ZQpFEmSk5NTmbVnZGSofv36Ztcr6bxL39vLj9u7d2+FariS96ZFix
Ylvn7h9qysrGKPvdz8+fMVGBioc+fOFdl3zTXX6Mcff9R//vMfSVJcXJz+/PNPde3aVadOnZIkZWdn68KFC5oxY4ZuvvlmSVK3bt2UlpamV199VWfPnlWDBg1M1/T39zerp/DnWJl+++03bd68WbNnz9bgwYMlSb1795aLi4vmzZun++67zzSeLh+zv/76a7nPLcnrr7+uVatWaeHChbrmmmsq/fsDUDvQ1ggAFuTi4qIBAwaYtTZu2LBBN910k9lMgyQdPXpUcXFxGjBggHJzc00f4eHhcnd316ZNm0p9rWPHjunMmTMaMmSI2fagoCB16tRJ27ZtM22LjY2Vh4dHJXyH5tLT07Vv3z7ddNNNpvAhSZ6enrr22mvNaig8fs6cOerevXuZvwwXCg0NlSS98847SkhIUHZ2tnJzc8t1bmV93xWp4Urfm8py6NAhrVq1Sk8//XSx+wcMGKCYmBgdPXpUkvTNN98oLCxMDRs2NB3j5OSkZcuW6eabb1Z8fLy2bNmilStXmha1yc7OvuK68vPzlZubK6PRWOYxhR+XHrtt2zY5ODjoxhtvNDvn1ltvNe0vydWcK0krVqzQkiVLNHjwYLPZVQC4UsycAYCF3XTTTXrkkUcUFxcnZ2dn/fHHH3rssceKHJecnCypoAWsuDawhISEUl+n8Pz69esX2Ve/fn1FRkZKKpihO336tBo1anRl30g5nD9/XkajscQazp8/b7Zt/Pjx8vT0NGuLLEt4eLhmzJihJUuWaOHChVdU36lTp0pt/6vKGq70vaksL730kgYPHqxOnToVu9/Pz08dOnTQjz/+qGbNmumrr77SkCFDTOOl0O+//66XX35ZR48eVZ06ddSmTRu5ublJUqkBqySLFi3SokWLZG9vr/r166tPnz76v//7P7NFcgpnmy/VrVs3SQWtmvXq1TMLupJMM3ilvZ9Xc64kHTx4UH369NGXX36p+++/X+3atSv1eAAoCeEMACzsmmuuUZ06dfTNN9/Izc1NjRo1UocOHYoc5+npKang3p7CX0Av5eXlVerr1K1bV5J09uzZIvvOnDmjevXqSZKioqKUmZlZZBGPyuDh4SGDwVBiDYU1Fpo6daq++eYbTZo0SStWrCh3+9udd96pjRs3Kjc3V88884waNWqkCRMmlHpOfn6+9uzZo+HDh5frNS6f2bzaGq70vakMX3/9tfbv32/W5lqc6667Tj/++KNuuukm7d+/XwsXLjQLZ3///bcmTpyogQMHavHixWrcuLEMBoNWrFhRpK1VKvu9kwrevzvvvFP5+fk6ffq05syZo7Fjx+rzzz83HfP888+bhelL7xvz8vLSuXPnlJeXZxayCv8Ro3C8F+dqzpWk//u//9N9992nwYMHa8aMGVqzZk2RoAcA5UFbIwBYmJOTkwYOHKhvv/1WX3/9tekel8s1a9ZMPj4+OnnypEJCQkwffn5+ev3114vMZFwuODhYDRo00Jdffmm2/cSJE9q9e7c6d+4sSfrll1/Utm1beXt7X/H3kp+fX+ovoW5uburQoYO+/vpr5eXlmbafP39ev/zyS5H77Dp06KCFCxfq1KlTmjVrVrnrmDdvnn755Re9+uqruummmxQSElLm/V67du1Senq6unfvXupxhbNAly+IcbU1XOl7c7Wys7P12muvaeLEiWb3gxVn4MCB2rNnjz788EN16dJFvr6+Zvv379+vrKwsPfTQQwoKCjKFr8JgVvieFa50WNZ7JxUsYBMSEqKwsDDddNNNuvfee/XXX38pJSXFdExwcLDZfwuX3t/XrVs35ebmFlkNtTDclfZ+Xs25UsFMp4uLi5555hkdOHBA7777bpnfLwAUh5kzALCCm2++WePGjZOdnZ3ZSoGXsre31+OPP65nnnlG9vb2uvbaa5WamqpFixYpPj6+zHY8Ozs7TZ48WdOmTdOUKVN066236ty5c1q4cKG8vLw0atQoHThwQCtWrNDgwYO1e/du07lnzpyRVD
BDkpSUVCS4JSUlKTo6WsePHzeFvJJMmTJFo0eP1kMPPaR77rlHOTk5WrJkibKzszVx4sQix/v5+emxxx7TzJkzNXz
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Список для хранения инерций\n",
"inertias = []\n",
"clusters_range = range(1, 11)\n",
"\n",
"# Вычисление инерции для каждого количества кластеров\n",
"for i in clusters_range:\n",
" kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
" kmeans.fit(data_scaled)\n",
" inertias.append(kmeans.inertia_)\n",
"\n",
"# Визуализация метода локтя\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(clusters_range, inertias, marker='o')\n",
"plt.title('Метод локтя для оптимального k')\n",
"plt.xlabel('Количество кластеров')\n",
"plt.ylabel('Инерция')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Расчет коэффициентов силуэта"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2MAAAImCAYAAADe01JiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACehUlEQVR4nOzdeVhUZf8G8HsYhn1H2URlUcENN8BcQMxSf1qpaFku5V6mkRuYReaWWm65pGWpqaWW4ZKvS+auaQguuSAoCgYiiyD7MsDM7w9ichxQBud4WO7PdXEJ5zznzPd8o/f17jnnORKlUqkEERERERERPVd6YhdARERERERUHzGMERERERERiYBhjIiIiIiISAQMY0RERERERCJgGCMiIiIiIhIBwxgREREREZEIGMaIiIiIiIhEwDBGREREREQkAoYxIiIiIiIiETCMERE9YuTIkRg5cqTatsjISLz22mto1aoVdu7cKejnr169Gh4eHlofV1HdRFSz8d9bItIXuwAioposPT0d7733Hlq3bo0NGzZUKygRERERVYRhjIjoCTZt2oTCwkJ8+eWXsLe3F7scIiIiqkN4myIRUSUePnyIbdu24dVXX9UIYvHx8QgKCkK3bt3Qvn17jBw5EhcuXFAbc+nSJQwfPhwdOnSAr68vpk+fjpSUFLUxv//+O/r06YO2bdti7NixSE1NBQDs3LkTPXr0QPv27TF16lRkZWWpjlEqlVi1ahW6du2KDh06YO7cuSguLkZxcTE+//xzeHt7o0uXLli5ciWUSqXqOA8PD6xevVrtPG+++SY8PDyQmJgIAPjoo4/w4osvqtWYmJgIDw8P7Nq1q8KfAaCoqAi9evXSmDk8cuQIAgMD0bZtW3Tr1g0LFixAfn6+an9lt2WW11r+WRV9ldf5tFu9Krqmx+Xm5mL+/Pnw8/ND+/btMXjwYJw4cUK1/8UXX8RHH32kdsy0adPg4eGB8PBwtXFPqvWLL76Al5cXcnJy1M61du1adOrUCQUFBSgqKsKyZcvQs2dPtGnTBq+88gr27NkDAAgPD6+0H+U9yMjIwNy5c1XH+/r6YtKkSap/xpUpLi6u9NyPGjly5FPH3bp1C2PHjoWPj88Tz/Wo8n/W+/fvx3vvvYd27dohICAAX3/9NRQKhWpcYWEhli1bht69e6NNmzbo2LEjRo8ejRs3bqjGPHz4ENOmTYOvr6/q966oqEi1//F/FwDN30Vtfq+uXbuG1q1bq/2OpKeno0uXLhg9erTav4dPcvr0abRp0waffPJJlY8hotqNM2NERI9RKpW4f/8+FixYgJKSErz77rtq+2NjY/HGG2/AxcUFoaGhkMlk2LJlC9555x1s3LgRvr6+uHPnDsaMGQNvb2+sWrUKycnJWLZsGUaOHIndu3fD1NQU165dw9SpU/F///d/+OSTT/DHH3/gl19+AQCsWbMGs2bNQkFBARYvXoxPPvkEa9asAQBs3rwZ69atw+TJk9G6dWt89913uHTpEgBAJpNh2bJluHr1Kr7++mtYW1vj7bffrvA69+7dqzruWX3//fcaf9nft28fZsyYgVdffRVTpkzBvXv3sGLFCsTGxmLTpk2QSCRPPa+dnR1+/vlnAMCJEyewbt06rFmzBg0bNoSBgYFOai8tLcWYMWNUAdvNzQ27d+/GpEmTsHnzZnh7e2scExkZif3791d4vh49euD9999X/bx27VrExsYCAIYMGYKNGzfi0KFDeP3111Vj9u7di379+sHY2BihoaHYv38/ZsyYgSZNmmDPnj2YOXMmCgsL8corr6j6sXPnTvz666+qn83MzKBUKvHuu+8iKysLM2bMQIMGDRATE4OvvvoKn332GTZs2FBpH8rDyrp162BjY6P2GY9r1aoVPvvsM9XPj4+bOHEiDAwMMH/+fNjZ2UFPT6/Scz1uzpw56NGjB1avXo0LFy5gzZo1yM/PR3BwMAAgJCQEkZGRmDZtGpo0aY
K7d+9i5cqVmD59Ovbv3w+JRIJZs2bh6tWrmDt3LuRyOebMmQMjIyPMmDHjqZ9fHW3atMH48eOxbt06DBgwAF26dMHs2bOhUCiwePHiKv2uR0REYPLkyXj11VexYMGCKh1DRLUfwxgR0WMiIiIQEBAAmUyG7777Di4uLmr716xZAwMDA2zZsgVmZmYAgICAALzyyiv48ssv8euvv+LQoUPQ19fHqlWrYGxsDABwdHTE2LFjsWvXLowcORLfffcdGjVqhCVLlkBPTw/+/v6IiIhAXFwcPv74Y/Tp0wcAIJFI8NFHH+H27dtwcXHB999/j4EDB2LSpEkAgJYtWyIgIAASiQQrVqxAgwYN0KNHD9y7dw/ff/89hg8fDqlUqnYNeXl5WLp0KVq3bo3r168/U7/u37+P7777Tu1cSqUSS5cuhZ+fH5YuXaoa6+LiglGjRuHkyZMICAh46rkNDAzQvn17AMCdO3dU1+vs7PxMNT/q1KlT+Pvvv/H111/jpZdeAgC88MILSEhIwF9//aURxhQKBRYsWFBp72xsbFQ1l/9czt3dHR06dMDevXtVYezixYuIj4/H4sWLkZ+fj4MHD+K9997D8OHDAQDdu3fHvXv3sHLlSgwZMkR17tOnTwOA2melpKTA2NgYM2fOVNXduXNn/PPPP6rQVpnyGcsOHTrA2tpa7TMeZ2Zmpva5j47LyMhAQkICPv30U/Tt27fCMU/SunVr1e+Mv78/8vPzsXnzZlXAy8vLQ2hoKPr16wcA8PX1RW5uLhYvXowHDx7A0tISZmZm+Oyzz9C7d28AwNGjR3H69GnBwhgATJo0CceOHcPcuXMxYcIEHDlyBCtXrqzS7c1XrlzBu+++iz59+uDzzz9nECOqR3ibIhHRY1q1aoXFixfD0tISs2bN0pjxOX/+PHr27KkKYgCgr6+P/v3749q1a8jLy8P777+PM2fOwNjYGCUlJSgpKUHnzp1hY2ODiIgIAGV/AevatSv09P77n2JfX18AUAsA3bp1g1KpxJUrV5CUlIS0tDT4+fmp9tvb26Np06ZwcXFBgwYNVNu7d++OlJQUJCcna1zj2rVrYW1tjbfeeusZu1V26523tzd69uyp2nbnzh0kJyfjxRdfVF1/SUkJfHx8YGZmhj///FPtHI+OKSkp0boGpVKJkpIStdvZqurChQuQyWRqtzLq6elhx44dmDx5ssb4HTt2IC0tTRWGtTV48GBERkbi3r17AIDdu3fD1dUVHTp0gImJCSIiIjB69GgoFAqUlJSgtLQU/fv3R0ZGBuLi4p54bnt7e2zZsgWdOnVCYmIi/vzzT2zduhUXL16EXC5/4rHJycnQ09NT+72uDmtrazRp0gT/+9//EBMTg/z8fK3+2QwcOFDt5z59+qC4uBiXLl2CgYEBNmzYgH79+iElJQV//fUXduzYgePHjwMA5HI5DAwMsHTpUvTu3RslJSWIj4/H5cuX4e7urnbe8v6Wf1VUnza/VzKZDF988QUSExPxySefYNCgQWphtDJJSUkYP348lEolZs+erfa/B0RU93FmjIjoMWZmZhg0aBDc3Nzw1ltvYcqUKfj5559Vs0tZWVlqoadcgwYNoFQqkZubC1NTUxgaGgIAevfurfqLNwBkZ2cDANLS0lQzEOXKf7ayslJts7S0VI1PS0tTG1fOyspK4y9xFhYWAIAHDx6gUaNGqu3x8fHYvHkzvv/+eyQlJVWxKxU7f/48jhw5gt9++03ttr3MzEwAwNy5czF37lyN48qfjSvXunXrZ6ojIiJCdQ5ra2u0adMGkydPVpu9qUxmZmaF/ats7MqVKxESElLt0NKvXz8sXLgQe/fuxdixY3Hw4EFMmDBBtV9PTw8GBgbYtWsXZs2apXbso88OVua3337D8uXLcf/+fVhZWaFly5YwMjJ66nGJiYmws7ODTCbT/qIeIZFIsGbNGsyePRsDBgzQ+tmnx2eSymcWy6/99OnTWLhwIe7cuQNTU1N4en
rCxMQEADQ+a/DgwYiOjoZEItF4/mvt2rVYu3btE2vR9veqZcuW8PDwwLVr19T+48STJCYmonv37ggPD8eaNWs0nk0
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Список для хранения коэффициентов силуэта\n",
"silhouette_scores = []\n",
"\n",
"# Вычисление коэффициентов силуэта для каждого количества кластеров\n",
"for i in clusters_range[1:]: \n",
" kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
" labels = kmeans.fit_predict(data_scaled)\n",
" score = silhouette_score(data_scaled, labels)\n",
" silhouette_scores.append(score)\n",
"\n",
"# Построение диаграммы значений силуэта\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(clusters_range[1:], silhouette_scores, marker='o')\n",
"plt.title('Коэффициенты силуэта для разных k')\n",
"plt.xlabel('Количество кластеров')\n",
"plt.ylabel('Коэффициент силуэта')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средний коэффициент силуэта: 0.678\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1MAAAJzCAYAAADqY0keAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3xUVfrH8c+ZmkaTXqQrRXpHBQRsa13Q/SkqiGUBBde6llXXCpbFCiggqyisyloX27piY7EhKsgKgoAgvfckU8/vj0sGhiSQGZJMQr7v14sXzD137n1ymMB9cs55jrHWWkRERERERCQhrlQHICIiIiIiUh4pmRIREREREUmCkikREREREZEkKJkSERERERFJgpIpERERERGRJCiZEhERERERSYKSKRERERERkSQomRIREREREUmCkikREREREZEkKJkSkaQNHjyYFi1axP3q0qULQ4YMYe7cuakOT0SOci1atGDcuHH5ji9dupSePXvSp08fVq5cWej7x40bR4sWLWjbti179uwp8JxXXnmFFi1a0K9fv+IKW0SOIkqmROSItG7dmhkzZjBjxgxefvllHn74YbxeL1dddRW//PJLqsMTkQrml19+YejQoaSnpzN9+nQaN2582PeEw2E++eSTAtvef//9Yo5QRI4mSqZE5IhkZWXRoUMHOnToQOfOnTn11FMZN24cLpeLN998M9XhiUgFsnz5ci6//HIyMzOZPn06xx57bJHe16lTJz744IN8xzdu3Mi8efNo1apVcYcqIkcJJVMiUuzS09Px+/0YY2LHBg8ezODBg+POe+yxx2jRokVc0jV9+nT69+9Px44dueyyy1i6dCkA//jHP2jRogW//vpr3DX+9a9/0apVK9avXw/ArFmzuOSSS+jYsSNt2rThzDPP5B//+Efce26//fZ80xPzfq1ZsyZ2zsHTel599dV804ref/99zjrrLDp06MDAgQOZN29e3HsOF88333xDixYt+Oabb+Led3B/FaX/gsEgjzzyCH369KFVq1ZxX9ehEtuDrz169Gjatm3L7Nmzgf1ToQr6dWDcRen7TZs2cdttt9GzZ8/Y3/EPP/wAQL9+/Q779zJv3jwuu+wy2rdvT7du3bjtttvYtm1b7PpvvvkmLVq0YMGCBQwYMIB27dpx7rnn8u9//zsujt27d/PQQw9x6qmn0rZtW8455xxef/31uHMOjKdly5Z07dqV6667ju3btxfalwArVqxg1KhRdOvWja5duzJ8+HCWL19e6PmH6t8D/95WrlzJn/70J0466SQ6dOjA4MGD+e6772Lta9asib1v5syZcff49NNPY20Hev/99xk4cCAdO3bkpJNO4q9//Ss7d+7MF9uBCvos9uvXj9tvv73Q1wfLi/XAr+/777/noosuom3btpx00kk88MAD5ObmFnqNgy1fvpwhQ4ZQqVIlpk+fTr169Yr83rPOOos5c+bkm+r373//myZNmtCyZct875k1axYDBw6Mxfvggw+SnZ2d75yifP9/9dVXXHnllbRv356TTjqJv/3tb0Qikdh5X3zxBf/3f/9Hx44d6dq1K9dcc80hP1MiUnqUTInIEbHWEg6HCYfDhEIhNm/ezGOPPUYwGOSCCy4o9H2//fYbU6dOjTv2n//8hwceeICzzz6bCRMmEIlEGDFiBMFgkHPPPRe/38+//vWvuPe8/fbb9OzZk7p16/LZZ58xcuRITjjhBJ555hnGjRvHsccey/3338+CBQvi3lezZs3Y9MQZM2ZwzTXXHPLr3LlzJ08++WTcsR9//JFbbrmFDh068Oyzz1K3bl1GjBjBli1bABKKJ1EF9d9zzz3Hiy++yOWXX86LL77IjBkzGD9+fELX/fHHH3nllVd48skn6dixY1zbgf3117/+Na6tKF/r3r17GTRoEN988w1//vOfGT9+PH6/nyuvvJKVK1cyfvz4uJivueaa2P1q1arFt99+y9ChQ0lLS+
PJJ5/kL3/5C3PnzmXIkCH5HrqHDx9O//79GT9+PE2aNOGGG27g888/ByA3N5dLLrmEd955h6uvvppnnnmGzp07c+eddzJx4sS46/Tp04cZM2Ywbdo0br75Zr744gtGjx5daP9t3LiRiy66iJUrV3Lvvffyt7/9jS1btnD55ZezY8eOQ/b9gf178N/bsmXLGDhwIGvWrOGuu+5i7NixGGO4/PLL861PzMzMzDdl7f3338fliv8v/5lnnuGmm26iQ4cOPP3004wcOZIPP/yQwYMHJ5TEFIf169dz1VVXUa1aNcaPH8+f/vQn/vWvf3HrrbcW6f0rVqzg8ssvJysri+nTp1O7du2E7n/GGWcQiUQK7Lezzz473/nvvPMOI0eOpGnTpkyYMIFRo0Yxc+ZMrr32Wqy1QGLf/7fccgudO3dm4sSJnHPOOUyZMoXXXnsNgNWrV3PttdfSpk0bnn32WUaPHs2vv/7KsGHDiEajCX2dIlL8PKkOQETKt2+//ZYTTjgh3/GbbrqJZs2aFfq+MWPGcNxxx/HTTz/Fjm3bto1LLrmEm266CXBGWvJ+qt+qVStOO+00Zs6cyfXXX48xhg0bNvD111/zt7/9DXAeOAcMGMCdd94Zu2bHjh3p3r0733zzDe3bt48d9/l8dOjQIfZ6xYoVh/w6n376aerVqxc3KrFhwwbOOOMMHnzwQVwuFzVq1OCcc85h/vz5nHrqqQnFk6iC+u/HH3+kZcuWXHnllbFjeSM6RZU3Mti/f/98bQf2VyAQiGsrytf61ltvsXbtWt56663YtKlOnTrx+9//nm+//ZY//OEPcTE3bNgw7p6PPfYYTZo0YdKkSbjdbgDat2/P2WefzRtvvMGll14aO3fw4MGMHDkSgF69ejFgwAAmTJhAnz59ePPNN1m6dCmvvvpqLGHs1asX4XCYZ555hosvvpiqVasCcMwxx8Ri6Nq1K19++WVcnx9s6tSpBINBXnjhBWrWrAlAy5YtGTRoEAsWLKBPnz6FvvfAr/Xgv7fx48fj8/l46aWXyMrKAuCUU07hnHPO4dFHH40bVevduzf//e9/CQaD+Hw+AoEAH3/8MV27do2NJO7cuZNnn32W//u//4tLjI8//nguvfTSfP1Z0p577jmqVavGhAkTYn+3LpeLu+66iyVLluQbHTvQypUrGTJkCFu2bCEUCiWVYNSoUYOuXbvywQcfcN555wGwdu1aFixYwKOPPsqzzz4bO9day9ixY+nVqxdjx46NHW/cuDFDhw7l888/55RTTkno+/8Pf/hD7PPas2dPZs2axWeffcbFF1/Mjz/+SG5uLsOHD48liXXq1OHjjz8mOzs79nkQkdRQMiUiR+SEE07gvvvuA5yHjF27djF79myeeOIJsrOzufHGG/O9Z/bs2Xz55Zc899xzDBkyJHb84osvBiAajZKdnc1//vMf0tLSqF+/PgAXXngh7777LvPmzaNr1668/fbbZGZmctpppwFw9dVXA84IyK+//spvv/3GwoULAScxS9bSpUtjoxN5MQKcfvrpnH766Vhryc7O5oMPPsDlctGkSZMSjaew/mvbti2TJ0/mww8/pEePHmRmZhb5wdJayw8//MD777+fb8SrKIrytX733Xc0aNAgbv1Jeno6H3744WGvn5OTw4IFC7jqqqtio6EAxx57LM2aNeOLL76Ie/gfMGBA7M/GGE477TTGjRtHbm4uc+fOpX79+vlG3s477zxef/31uKQn717RaJSff/6Z7777jhNPPLHQOL/77js6dOgQS6TAefD99NNPD/s1HsrcuXPp27dv3IOzx+OJjeLu3bs3drxHjx7Mnj2bb775hl69ejF79myysrLo0qVLLJmaP38+wWCQc845J+4+Xbp0oX79+sydO/eIk6m8vnO5XPlGxfJEo1HC4TDz5s3j5JNPjiVS4CSF4PTpoZKpd999lzZt2vDEE09w5ZVX8uc//5mpU6fG3TMSicRGjMD5TBx4L3Cm+j
344IPs2bOHrKws3nvvPU444QQaNWoUd96KFSvYsGEDw4cPj30OwUm2s7Ky+OKLLzjllFMS+v4/+LNYp06d2JTB9u3
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Добавляем индекс строки как дополнительный признак\n",
"data_scaled_with_index = np.hstack((data_scaled, np.arange(data_scaled.shape[0]).reshape(-1, 1)))\n",
"\n",
"# ========================\n",
"# Применение K-Means\n",
"# ========================\n",
"kmeans = KMeans(n_clusters=3, random_state=42) \n",
"df_clusters = kmeans.fit_predict(data_scaled)\n",
"\n",
"# ========================\n",
"# Оценка качества кластеризации\n",
"# ========================\n",
"silhouette_avg = silhouette_score(data_scaled, df_clusters)\n",
"print(f'Средний коэффициент силуэта: {silhouette_avg:.3f}')\n",
"\n",
"# ========================\n",
"# Визуализация кластеров\n",
"# ========================\n",
"pca = PCA(n_components=2)\n",
"df_pca = pca.fit_transform(data_scaled_with_index)\n",
"\n",
"plt.figure(figsize=(10, 7))\n",
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_clusters, palette='viridis', alpha=0.7)\n",
"plt.title('Визуализация кластеров с помощью K-Means')\n",
"plt.xlabel('Первая компонента PCA')\n",
"plt.ylabel('Вторая компонента PCA')\n",
"plt.legend(title='Кластер', loc='upper right')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Средний коэффициент силуэта, равный 0.678, указывает на хорошую кластеризацию. \n",
"\n",
"Средний коэффициент силуэта (silhouette score) указывает на качество кластеризации, измеряя, насколько хорошо точки внутри одного кластера близки друг к другу по сравнению с точками из других кластеров. Значения коэффициента силуэта находятся в диапазоне от -1 до 1:\n",
"\n",
"1: Указывает на идеально плотные и четко разделенные кластеры. \n",
"0: Указывает на перекрытие кластеров или слабую структуру кластеризации. \n",
"Отрицательные значения: Указывают, что точки в кластере расположены ближе к другому кластеру, чем к своему."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средний коэффициент силуэта (агломеративная кластеризация): 0.724\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1MAAAJzCAYAAADqY0keAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3iTVf8G8Du7TdK9d0sLpYWW1UJBNjheURT0fRWZKgIKKCgvyk9REEFEQGUvlSWCCiIovig4EET2phRoaUv33mn27w9sbGgLTWhJS+/PdXFBn/MkzzenIcmdc57zCIxGoxFERERERERkEaGtCyAiIiIiImqOGKaIiIiIiIiswDBFRERERERkBYYpIiIiIiIiKzBMERERERERWYFhioiIiIiIyAoMU0RERERERFZgmCIiIiIiIrICwxQREREREZEVxLYugKgxjBw5EkePHjXb5uDggMjISEyaNAldu3a1UWVERNRcvf/++ygvL8drr72GhIQEvPzyyzh8+DBEIpGtSyMiG2GYontWZGQk3nnnHQCAXq9HYWEhvvzySzz//PPYsWMHWrdubeMKiYioORkzZgxGjBiBuLg4SCQSzJw5k0GKqIUTGI1Go62LIGpoI0eOBABs2rTJbHtFRQW6d++OZ555Bq+//rotSiMiomZMq9UiNTUVbm5ucHZ2tnU5RGRjPGeKWhR7e3vIZDIIBALTtpEjR5rCV5VFixYhPDwcO3bsMG3bvHkzBgwYgE6dOmHEiBG4fPkyAOCLL75AeHg4rl27ZnYf3333HSIiIpCZmQkA2LdvH5555hl06tQJ7du3x0MPPYQvvvjC7DZvvPEGwsPDa/2TlpZm2qd///5mt9u6dSvCw8OxdOlS07Y9e/bg4YcfRseOHTF06FAcP37c7Da3q+fIkSMIDw/HkSNHzG53c3/Vp/80Gg0++OAD9OnTBxEREWaPq3of3+zm+547dy6ioqJw4MABAMDSpUvr7K/qdden73NycvD666+je/fupt/xqVOnAAD9+/e/7e/l+PHjGDFiBDp06ICuXbvi9ddfR0FBgen+d+zYgfDwcJw5cwZDhgxBdHQ0Hn30Ufzvf/8zq6O0tBTvv/8+Bg4ciKioKDzyyCP45ptvzPapXk/btm0RGxuLyZMno7CwsM6+BICkpCTTNNfY2FiMHz8eiYmJde5/q/6t/ntLTk7Gyy+/jPvuuw8dO3bEyJEjceLECVN7Wlqa6Xa7du0yO8avv/5qaqtuz549GDp0KDp16oT77rsPb7/9NoqLi2vUVl1tz8X+/fvjjTfeqPPnm1XVWv3xnTx5Ek899RSioqJw3333Yc6cOaisrKzzPqruZ/r06ejZsyfatWuH7t27Y/r06Wa/o9qeV2lpafV+Xufk5GDGjBno06cPoqOj8eSTT2L//v1mdVTdbuXKlWbbL1++XOM5DDTc8/hWj7/68+HmP1WvbfV5XamqpepP+/bt8eCDD5o9x2p7nlT1S/XXy/r25dKlSyGRSBAaGgonJyc8/fTTNfrwVscqLy/HyJEjERkZCbVabXqsdfVHFb1ejzVr1uCRRx5BdHQ0OnbsiKeffhp//fWX2bFOnz6N5557Dp07d0ZcXBxeffVVZGdn16vPAeDrr7/GoEGD0L59e/Tt2xdLly6FXq83tb/xxhsYOXIkvvnmG/Tr1w+dOnXC6NGjcenSJdM+Vb+X6n1y5coVtGvXzux3Gh8fj+HDh6NTp04YOHAgtm7davZYLl26hEmTJiEuLg7t2rVDr1698N5775n937v59wjU/J3X9hz4448/EB4ebnotqO3/vVqtxoABA2p9/hBVxzBF9yyj0QidTgedTgetVovc3FwsWrQIGo0GTzzxRJ23S01Nxfr16822/fTTT5gzZw4GDRqE5cuXQ6/XY8KECdBoNHj00Uchk8nw3Xffmd1m586d6N69O3x8fPDbb79h4sSJaNeuHVasWIGlS5ciICAA7777Ls
6cOWN2Ow8PD2zbts3058UXX7zl4ywuLsbHH39stu3s2bOYNm0aOnbsiJUrV8LHxwcTJkxAXl4eAFhUj6Vq67+1a9diw4YNGD16NDZs2IBt27Zh2bJlFt3v2bNn8eWXX+Ljjz9Gp06dzNqq99fbb79t1lafx1peXo5hw4bhyJEj+O9//4tly5ZBJpPhueeeQ3JyMpYtW2ZW84svvmg6nqenJ44dO4YxY8bAzs4OH3/8Mf7v//4PR48exahRo2p86B4/fjwGDBiAZcuWISQkBFOmTMHvv/8OAKisrMQzzzyD3bt3Y+zYsVixYgW6dOmCN998E6tWrTK7nz59+mDbtm3YtGkTXnvtNRw6dAhz586ts/+ys7Px1FNPITk5GbNmzcKHH36IvLw8jB49GkVFRbfs++r9e/Pv7erVqxg6dCjS0tLw1ltvYeHChRAIBBg9enSN8xYVCgV++eUXs2179uyBUGj+VrRixQq8+uqr6NixI5YsWYKJEydi7969GDly5G1DTEPLzMzE888/DxcXFyxbtgwvv/wyvvvuO0yfPr3O26hUKowaNQqJiYl455138Omnn2LUqFH44Ycf8NFHH5ntW/V7rP58qnKr53VeXh6efPJJHD9+HFOnTsXSpUvh5+eHiRMn1gis9e33hnoe3+7xe3p61nh9u/n/2M1qe12pUnXb5cuXo1WrVnj99ddrfLl1K5b0ZXXfffed6QuX+tqyZQvy8vKwYcMGSKVS0/bIyEiz3/eTTz5pdruFCxdixYoVeOqpp7Bu3TrMmTMHRUVFeOWVV6BSqQAAFy9exIgRI6BWq7FgwQLMnj0b58+fx/PPP1+vPl+9ejVmzpyJ7t27Y9WqVRg+fDjWrl2LmTNnmtUSHx+Pjz76CJMmTcKHH36IwsJCjBgxAjk5OXU+7rlz50Kn05l+VqlUeOGFF6DT6bB06VIMHjwY77zzjumLspycHAwfPhwqlQrz58/H2rVrMWjQIGzatAkbN260qM9vptVqMW/evNvut27duluGZKIqPGeK7lnHjh1Du3btamx/9dVXERoaWuft5s2bh9atW+PChQumbQUFBXjmmWfw6quvArgx0lL1rX5ERATuv/9+7Nq1C6+88goEAgGysrLw119/4cMPPwRw4wPnkCFD8Oabb5rus1OnTujWrRuOHDmCDh06mLZLpVJ07NjR9HNSUtItH+eSJUvg6+tr9o13VlYWHnzwQbz33nsQCoVwd3fHI488gtOnT2PgwIEW1WOp2vrv7NmzaNu2LZ577jnTNkvfpKpGBgcMGFCjrXp/VX3bW6U+j/Xbb79Feno6vv32W0RERAAAOnfujMcffxzHjh3Dv//9b7OaAwMDzY65aNEihISEYPXq1abzJzp06IBBgwZh+/btGD58uGnfkSNHYuLEiQCAXr16YciQIVi+fDn69OmDHTt24PLly9i6daspMPbq1Qs6nQ4rVqzA008/bZpW5OrqaqohNjYWf/75p1mf32z9+vXQaDT4/PPP4eHhAQBo27Ythg0bhjNnzqBPnz513rb6Y73597Zs2TJIpVJs3LgRSqUSANC3b1888sgjWLBggdmoWu/evfHHH39Ao9FAKpVCrVZj//79iI2NNY24FBcXY+XKlfjPf/5jFiDatGmD4cOH1+jPxrZ27Vq4uLhg+fLlpt+tUCjEW2+9hYSEhFq/tU5OToa3tzc++OADBAQEAADi4uJw5syZGgGz+u/xZrd6Xn/++ecoKCjA3r174efnB+BGMBszZgwWLFiARx55xBSWevfujf/973/IyckxhbUff/zRrN+Bhnse3+7xV3+Nq3p9i4iIgL+/f639ANT+ulKl+m19fHzwyy+/ID4+HiEhIXXen7V9WaW8vBwLFy5Eu3btbvn/rjq9Xm86bzc2NtasTalUmv2+//jjD7P2nJwcTJ061WxkRyaTYfLkyUhISEDHjh2xatUqODs747PPPoNMJgMAeHp64rXXXkNiYuIt+7y0tNQU1t
566y0AQM+ePeHs7Iy33noLzz77rOk849LSUqxatQoxMTEAgOjoaAwcOBAbN27EtGnTajzuvXv34syZM2a/j/T0dER
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Добавляем индекс строки как дополнительный признак\n",
"data_scaled_with_index = np.hstack((data_scaled, np.arange(data_scaled.shape[0]).reshape(-1, 1)))\n",
"\n",
"# ========================\n",
"# Агломеративная кластеризация\n",
"# ========================\n",
"agg_cluster = AgglomerativeClustering(n_clusters=3) \n",
"labels_agg = agg_cluster.fit_predict(data_scaled)\n",
"\n",
"# ========================\n",
"# Оценка качества кластеризации\n",
"# ========================\n",
"silhouette_avg_agg = silhouette_score(data_scaled, labels_agg)\n",
"print(f'Средний коэффициент силуэта (агломеративная кластеризация): {silhouette_avg_agg:.3f}')\n",
"\n",
"# ========================\n",
"# Визуализация кластеров\n",
"# ========================\n",
"pca = PCA(n_components=2)\n",
"df_pca = pca.fit_transform(data_scaled_with_index)\n",
"\n",
"plt.figure(figsize=(10, 7))\n",
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=labels_agg, palette='viridis', alpha=0.7)\n",
"plt.title('Визуализация кластеров с помощью агломеративной кластеризации')\n",
"plt.xlabel('Первая компонента PCA')\n",
"plt.ylabel('Вторая компонента PCA')\n",
"plt.legend(title='Кластер', loc='upper right')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Значение коэффициента силуэта лежит в диапазоне от -1 до 1. Ближе к 1: Хорошо сформированные, плотные кластеры, четко отделенные друг от друга. \n",
"\n",
"Ближе к 0: Кластеры пересекаются или слабо разделены, не имеют четких границ. Точки расположены одинаково близко как к своему кластеру, так и к соседним. \n",
"Ближе к -1 (Отрицательные значения): Некоторые точки скорее относятся к другим кластерам, чем к текущему (ближе к центрам других кластеров). Очень плохая кластеризация. \n",
"Ближе к 1: Все точки внутри каждого кластера плотно сгруппированы и значительно удалены от точек других кластеров. Свидетельствует о четкой и хорошо разделенной структуре данных. Единица говорит об идеальной кластеризации.\n",
"\n",
"Средний коэффициент силуэта, равный 0.724, указывает на то, что кластеры имеют хорошее разделение и четкие границы. Точки внутри каждого кластера достаточно плотно сгруппированы и значительно удалены от точек других кластеров, что свидетельствует о четкой и хорошо разделенной структуре данных."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}