1129 lines
1.5 MiB
Plaintext
1129 lines
1.5 MiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Лабораторная работа №5\n",
|
|||
|
"\n",
|
|||
|
"*Вариант задания:* Товары Jio Mart (вариант - 23) "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['category', 'sub_category', 'href', 'items', 'price'], dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"from sklearn.decomposition import PCA\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.metrics import silhouette_score\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//jio_mart_items.csv\")\n",
|
|||
|
"df = df.iloc[:15000]\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>category</th>\n",
|
|||
|
" <th>sub_category</th>\n",
|
|||
|
" <th>href</th>\n",
|
|||
|
" <th>items</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>Groceries</td>\n",
|
|||
|
" <td>Fruits & Vegetables</td>\n",
|
|||
|
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
|
|||
|
" <td>Fresh Dates (Pack) (Approx 450 g - 500 g)</td>\n",
|
|||
|
" <td>109.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>Groceries</td>\n",
|
|||
|
" <td>Fruits & Vegetables</td>\n",
|
|||
|
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
|
|||
|
" <td>Tender Coconut Cling Wrapped (1 pc) (Approx 90...</td>\n",
|
|||
|
" <td>49.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>Groceries</td>\n",
|
|||
|
" <td>Fruits & Vegetables</td>\n",
|
|||
|
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
|
|||
|
" <td>Mosambi 1 kg</td>\n",
|
|||
|
" <td>69.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>Groceries</td>\n",
|
|||
|
" <td>Fruits & Vegetables</td>\n",
|
|||
|
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
|
|||
|
" <td>Orange Imported 1 kg</td>\n",
|
|||
|
" <td>125.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>Groceries</td>\n",
|
|||
|
" <td>Fruits & Vegetables</td>\n",
|
|||
|
" <td>https://www.jiomart.com/c/groceries/fruits-veg...</td>\n",
|
|||
|
" <td>Banana Robusta 6 pcs (Box) (Approx 800 g - 110...</td>\n",
|
|||
|
" <td>44.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" category sub_category \\\n",
|
|||
|
"0 Groceries Fruits & Vegetables \n",
|
|||
|
"1 Groceries Fruits & Vegetables \n",
|
|||
|
"2 Groceries Fruits & Vegetables \n",
|
|||
|
"3 Groceries Fruits & Vegetables \n",
|
|||
|
"4 Groceries Fruits & Vegetables \n",
|
|||
|
"\n",
|
|||
|
" href \\\n",
|
|||
|
"0 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"1 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"2 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"3 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"4 https://www.jiomart.com/c/groceries/fruits-veg... \n",
|
|||
|
"\n",
|
|||
|
" items price \n",
|
|||
|
"0 Fresh Dates (Pack) (Approx 450 g - 500 g) 109.0 \n",
|
|||
|
"1 Tender Coconut Cling Wrapped (1 pc) (Approx 90... 49.0 \n",
|
|||
|
"2 Mosambi 1 kg 69.0 \n",
|
|||
|
"3 Orange Imported 1 kg 125.0 \n",
|
|||
|
"4 Banana Robusta 6 pcs (Box) (Approx 800 g - 110... 44.0 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>count</th>\n",
|
|||
|
" <td>15000.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>mean</th>\n",
|
|||
|
" <td>373.427633</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>std</th>\n",
|
|||
|
" <td>463.957949</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>min</th>\n",
|
|||
|
" <td>5.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25%</th>\n",
|
|||
|
" <td>123.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>50%</th>\n",
|
|||
|
" <td>250.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>75%</th>\n",
|
|||
|
" <td>446.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>max</th>\n",
|
|||
|
" <td>14999.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" price\n",
|
|||
|
"count 15000.000000\n",
|
|||
|
"mean 373.427633\n",
|
|||
|
"std 463.957949\n",
|
|||
|
"min 5.000000\n",
|
|||
|
"25% 123.000000\n",
|
|||
|
"50% 250.000000\n",
|
|||
|
"75% 446.000000\n",
|
|||
|
"max 14999.000000"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.describe()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"category 0\n",
|
|||
|
"sub_category 0\n",
|
|||
|
"href 0\n",
|
|||
|
"items 0\n",
|
|||
|
"price 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"category False\n",
|
|||
|
"sub_category False\n",
|
|||
|
"href False\n",
|
|||
|
"items False\n",
|
|||
|
"price False\n",
|
|||
|
"dtype: bool\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Процент пропущенных значений признаков\n",
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
|
|||
|
"\n",
|
|||
|
"print(df.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"print(df.isnull().any())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"category object\n",
|
|||
|
"sub_category object\n",
|
|||
|
"href object\n",
|
|||
|
"items object\n",
|
|||
|
"price float64\n",
|
|||
|
"dtype: object"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Проверка типов столбцов\n",
|
|||
|
"df.dtypes"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Атрибуты \n",
|
|||
|
"\n",
|
|||
|
"category: Категория товара.\n",
|
|||
|
"\n",
|
|||
|
"sub_category: Подкатегория товара.\n",
|
|||
|
"\n",
|
|||
|
"href: Ссылка на товар.\n",
|
|||
|
"\n",
|
|||
|
"items: Название товара.\n",
|
|||
|
"\n",
|
|||
|
"price: Цена товара.\n",
|
|||
|
"\n",
|
|||
|
"# Цель:\n",
|
|||
|
"Оптимизация стратегий ценообразования и маркетинга для розничных компаний, стремящихся привлечь покупателей с различными предпочтениями.\n",
|
|||
|
"Кластеризация товаров на основе их характеристик (категория, подкатегория, цена) для выявления групп с похожими профилями."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Очистка данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Цель: Упростить набор данных, удалив несущественные столбцы, чтобы сосредоточиться на ключевых атрибутах, которые будут использоваться для кластеризации и анализа.\n",
|
|||
|
"\n",
|
|||
|
"Столбцы href и items несущественны для анализа, они не содержат ценной информации для решения задачи."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" category sub_category price\n",
|
|||
|
"0 Groceries Fruits & Vegetables 109.0\n",
|
|||
|
"1 Groceries Fruits & Vegetables 49.0\n",
|
|||
|
"2 Groceries Fruits & Vegetables 69.0\n",
|
|||
|
"3 Groceries Fruits & Vegetables 125.0\n",
|
|||
|
"4 Groceries Fruits & Vegetables 44.0\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Удаление несущественных столбцов\n",
|
|||
|
"columns_to_drop = ['href', 'items']\n",
|
|||
|
"df_cleaned = df.drop(columns=columns_to_drop)\n",
|
|||
|
"\n",
|
|||
|
"print(df_cleaned.head()) # Вывод очищенного DataFrame"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Визуализация парных взаимосвязей\n",
|
|||
|
"Визуализировать ключевые атрибуты товаров (категория, подкатегория, цена) для выявления закономерностей и связей между ними."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAA1TCAYAAABQAC/wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzde5zWdZ3//+cMDGeYgOQUi5wSUFAoQWyjjMzKzHLZbVsFy7Is26+ZmvVTK9NvtqWmedo8prK6aumaq9W6tbVbbSK4qRjDKuKoBIgHHM4zDFy/P64vYxPkjIBeHy7v99uNG8z7/bk+vqB/YB593p+aUqlUCgAAAAAAQIHUVnoAAAAAAACAPyVgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAAAAAhSNgAAAAO6VUKlV6BAAAoIoJGAAA8DqxcOHCfPGLX8whhxyS/fffP4ceemi+8pWv5Omnn37F93rsscfyd3/3d6/ClNVh3rx5GTduXLsf48ePz1ve8pZ89KMfzX/8x390eI8vf/nLmTlz5mswLQAAFFPXSg8AAAC8+m666aacd955Oeigg3Lqqadm0KBBefLJJ3Pttdfm3nvvzQ033JDx48d3+n4//elP87vf/e5VnLg6fPWrX81+++2XpPzESlNTU6677rqceOKJufLKK/POd77zz372xBNPzLHHHvtajQoAAIUjYAAAQJV74IEH8o1vfCPHHHNMzjzzzLb1gw46KIceemg+/OEP54wzzsgdd9xRwSmr09ixYzN58uR2awceeGAOOeSQ3HjjjS8bMEaMGPEqTwcAAMXmCCkAAKhy1157bfr27ZtTTjllu70BAwbky1/+ct797ndnw4YNSZJNmzblwgsvzGGHHZaJEyfmLW95S4477rg0NDQkSS699NJcdtllSZJx48bl0ksvTZJs3bo1V111Vd7znvdk4sSJee9735u5c+fucJ53v/vd2X///duOUxo3blzmzZvXds3ChQvzyU9+MgcddFDe8pa35DOf+Uwee+yxtv1tRzTdcsstede73pW3vOUt+dnPfpZx48bl17/+dbv/3oIFCzJu3Lg88MAD283yr//6rxk3blweffTRduvb7rVo0aIkyQ033JD3ve99mTRpUmbMmJGzzz4769at6/gPfwf69OmTUaNGZfny5X/29/Kb3/xmuyOkSqVSrr/++rz//e/P/vvvn/e85z259tpr272LZMGCBZk9e3YOOOCATJs2LV/60pfywgsv7NScAABQaZ7AAACAKlYqlfLrX/86M2fOTM+ePXd4zeGHH97u69NPPz0LFizIKaeckhEjRuTJJ5/Md7/73Zx66qm555578jd/8zdZuXJlfvjDH+bWW2/NkCFDkiRnn3127rjjjpxwwgmZMmVK5s+fn/POOy9r1qzJ5z73uSTJZZddlssvvzyf/OQnM3369PzqV7/KySef3O6/f9999+X444/PQQcdlPPOOy/Nzc258sor89GPfjS33XZbxowZ03btZZddlrPOOiubNm3K2972tgwaNCg/+tGP8va3v73tmjvvvDMjR47MW9/61u1+74ceemh69eqVe+65J/vss0/b+t133503v/nN2XfffXP33Xfn/PPPz5e+9KWMGzcuS5cuzbe+9a1s3Lgx3/rWt17Z/yBJWlpasmzZsuy///7t1v/49zJlypT867/+a7v9b3/727nhhhty3HHH5S//8i+zcOHCXHDBBWltbc0JJ5yQ+fPn57jjjsv06dNz8cUXp6mpKd/97ndz7LHH5oc//GF69OjximcFAIBKEjAAAKCKrV69Os3NzRk+fHinrm9pacn69etz1llntYWNadOmZd26dfmHf/iHPPfccxkyZEhbtNh2PNITTzyR2267Laeccko+/elPJ0ne/va3p6amJldeeWWOPvrodO/ePVdffXWOOeaYnHbaaW
3XbNy4MbfeemvbDBdeeGH23nvvXHXVVenSpUvbde95z3tyySWX5Lvf/W7btUcffXTe9773tX191FFHZe7cuVm/fn169+6dTZs25Sc/+UnbTH+qZ8+eee9735sf//jH+cIXvpAkWb9+fX7xi1+0RZf7778/w4cPzzHHHJPa2tpMmzYtvXr1SlNTU4d/nlu3bk1ra2uSpLW1NX/4wx9yxRVX5IUXXsgxxxzT7to//b38sTVr1uTGG2/M7Nmz88UvfjFJ8ra3vS3PPvts5s+fnxNOOCEXXnhhRo0alSuvvLLtz+2AAw7IBz7wgdx+++3b/fcAAKDoHCEFAABVbNs3srds2dKp67t165Zrr702hx9+eJ555pncd999ueWWW/KLX/wiSTlw7Mh9992XUqmUmTNnprW1te3HzJkz09zcnAceeCAPPvhgNm3atN036Y844oi2X2/YsCELFy7M+9///rbZk6Rfv35517velfvvv7/dZydMmNDu61mzZmXDhg3593//9yTJv//7v2fDhg358Ic//Gd/zx/60Ify1FNP5eGHH06S/PznP09LS0uOPPLIJMn06dPzxBNP5K/+6q9y2WWXZeHChfngBz+YOXPmvNwfZZLk4x//ePbbb7/st99+OeCAA3L44Yfnt7/9bc4666y84x3veNnfyx978MEH09ramsMOO6zd+llnnZVrrrkmGzduzEMPPZR3vvOdKZVKbX/+f/EXf5ExY8bkN7/5TYezAgBA0XgCAwAAqlh9fX169+7d9r6FHdmwYUM2b96c+vr6JMmvfvWrnHfeeVm6dGl69+6d8ePHp1evXknS7n0Lf+zFF19MknzgAx/Y4f4zzzzTdv8BAwa02xs4cGDbr9euXZtSqZQ3vvGN293jjW98Y9auXdtubdtc2+y9996ZNm1a7rzzznz4wx/OnXfembe97W0ZPHjwn/vt56CDDsrgwYNzzz33ZP/9988999yTadOmtT1lcvjhh2fr1q25+eabc8UVV+TSSy/Nm970ppx22mnbHb/1p77+9a9nv/32S1KOSfX19Rk2bFhqamq2u/ZPfy9/bNuf75/+2W2zZs2abN26NVdffXWuvvrq7fa7d+/+snMCAEARCRgAAFDl3v72t2fevHlpbm7e4Teyb7vttnzrW9/KD3/4w/Tt2zef+9zncuihh+bKK6/MX/zFX6SmpiY33XRTfvWrX/3Z/0a/fv2SlF923bt37+32hw0blieeeCJJ8vzzz2f06NFte3/8kum+ffumpqYmzz333Hb3ePbZZ/OGN7yhw9/vrFmzcsYZZ+Txxx/Pb3/721xwwQUve31tbW0++MEP5u67785nPvOZ/OY3v8k555zT7pojjjgiRxxxRNauXZtf//rXufrqq/PFL34xb33rW182jowaNSqTJk3qcOaObPvzfeGFF9r92S1fvjxPPfVUJk6cmJqamnz84x/fYUT6c+8/AQCAInOEFAAAVLlPfOITefHFF3PxxRdvt/fss8/muuuuy9ixY7PffvvlkUceSXNzcz796U9nxIgRbU8KbIsX257AqK1t/0+JAw88MEn5nRuTJk1q+/HCCy/ku9/9bl588cWMHz8+ffv2bTveaZt777237de9evXKxIkT85Of/KTdsVdr167NL3/5yx2+iPtPvfe9703Pnj1z9tlnp3fv3jn00EM7/MyHPvShrFy5Mpdffnm6dOnS7qimk08+ue19GH379s373//+nHjiiWltbc2qVas6vPfusP/++6eurq7tKK9trrvuupxyyinp1atX9t133yxdurTdn/+b3/zmXHrppZk3b95rMicAAOxOnsAAAIAqN3ny5Hz+85/PxRdfnMcffzwf/vCH079//zz22GO59tpr09zc3BY39ttvv3Tt2jXnn39+PvGJT6SlpSV33HFHfvnLXyYpHzeVvPREwN13350DDjgg48aNy5FHHpmvfOUr+cMf/pCJEyfmiSeeyEUXXZThw4dn5MiR6dKlS4
4//vhccskl6dmzZ6ZNm5b7778///zP/5zkpShy6qmn5pOf/GQ+/elP5+ijj87mzZtz1VVXpaWlpS0kvJyePXvmAx/
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x4500 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Настройка стиля графиков\n",
|
|||
|
"sns.set(style=\"whitegrid\")\n",
|
|||
|
"\n",
|
|||
|
"# Создание фигуры\n",
|
|||
|
"plt.figure(figsize=(16, 45))\n",
|
|||
|
"\n",
|
|||
|
"# График 1: Категория vs Цена\n",
|
|||
|
"plt.subplot(4, 1, 1)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['category'], y=df_cleaned['price'], alpha=0.6, color='blue')\n",
|
|||
|
"plt.title('Category vs Price')\n",
|
|||
|
"plt.xlabel('Category')\n",
|
|||
|
"plt.ylabel('Price')\n",
|
|||
|
"plt.xticks(rotation=90)\n",
|
|||
|
"\n",
|
|||
|
"# График 2: Подкатегория vs Цена\n",
|
|||
|
"plt.subplot(4, 1, 2)\n",
|
|||
|
"sns.boxplot(x=df_cleaned['sub_category'], y=df_cleaned['price'], color='green')\n",
|
|||
|
"plt.title('Sub-Category vs Price')\n",
|
|||
|
"plt.xlabel('Sub-Category')\n",
|
|||
|
"plt.ylabel('Price')\n",
|
|||
|
"plt.xticks(rotation=90)\n",
|
|||
|
"\n",
|
|||
|
"# График 3: Категория vs Подкатегория\n",
|
|||
|
"plt.subplot(4, 1, 3)\n",
|
|||
|
"sns.countplot(x=df_cleaned['category'], hue=df_cleaned['sub_category'], palette='Set3')\n",
|
|||
|
"plt.title('Category vs Sub-Category')\n",
|
|||
|
"plt.xlabel('Category')\n",
|
|||
|
"plt.ylabel('Count')\n",
|
|||
|
"plt.xticks(rotation=90)\n",
|
|||
|
"\n",
|
|||
|
"# Упорядочиваем графики\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Стандартизация данных для кластеризации"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" price category_Groceries sub_category_Dairy & Bakery \\\n",
|
|||
|
"0 -0.569958 1.0 0.0 \n",
|
|||
|
"1 -0.699284 1.0 0.0 \n",
|
|||
|
"2 -0.656175 1.0 0.0 \n",
|
|||
|
"3 -0.535471 1.0 0.0 \n",
|
|||
|
"4 -0.710061 1.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" sub_category_Fruits & Vegetables sub_category_Premium Fruits \\\n",
|
|||
|
"0 1.0 0.0 \n",
|
|||
|
"1 1.0 0.0 \n",
|
|||
|
"2 1.0 0.0 \n",
|
|||
|
"3 1.0 0.0 \n",
|
|||
|
"4 1.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" sub_category_Snacks & Branded Foods sub_category_Staples \n",
|
|||
|
"0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Выделяем числовые и категориальные признаки\n",
|
|||
|
"numerical_cols = ['price']\n",
|
|||
|
"categorical_cols = ['category', 'sub_category']\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"df_numerical_scaled = scaler.fit_transform(df_cleaned[numerical_cols])\n",
|
|||
|
"\n",
|
|||
|
"# Кодирование категориальных признаков с помощью OneHotEncoder\n",
|
|||
|
"encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # sparse_output=False: плотный массив вместо разреженной матрицы\n",
|
|||
|
"encoded_data = encoder.fit_transform(df_cleaned[categorical_cols])\n",
|
|||
|
"\n",
|
|||
|
"# Создаем новые столбцы для закодированных категориальных признаков\n",
|
|||
|
"encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))\n",
|
|||
|
"\n",
|
|||
|
"# Объединяем числовые и закодированные категориальные данные\n",
|
|||
|
"df_encoded = pd.concat([pd.DataFrame(df_numerical_scaled, columns=numerical_cols), encoded_df], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Выводим результат\n",
|
|||
|
"print(df_encoded.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABR4AAAP0CAYAAADMUCCZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxddZ0//te9TdJsbYoBStsUgVYLKCBSEBcQi0VGkRHRGVxQ+elXVBgVdETcly+oiAwqygzjhiiDGyLofPXLJigqWB0VFUHK1mBpS+iapbnpvb8/sPkSupDQE5K0z+fjwYPmnHPPead9N2le97OUarVaLQAAAAAABSqPdQEAAAAAwPZH8AgAAAAAFE7wCAAAAAAUTvAIAAAAABRO8AgAAAAAFE7wCAAAAAAUTvAIAAAAABRO8AgAAAAAFE7wCADANqvVamNdAqPMnzEAMFKCRwBgXDvxxBMzb968If89/elPzxFHHJGPfvSjWb169Savufvuu/ORj3wkL3zhC7P//vvniCOOyOmnn56//OUvW3zOv/3bv2XevHn5+Mc//pg1nXTSSTnkkEPS39+/xWte+tKX5jWveU2SZN68efn85z8/jM+2WO9973uzYMGCwY8XLFiQ9773vYU+44EHHsib3/zm3H///aP6nG2xbt26vOUtb8kBBxyQgw8+OPfcc88m11x++eWb9Nk+++yTgw8+OP/f//f/5Te/+c1jPufEE0/MiSeeOAqfwVDvfe97N6n1kf/9+Mc/LvR5/f39Ofvss3PVVVcVel8AYPtXN9YFAAA8ln333Tcf/vCHBz+uVCr505/+lPPOOy+33XZb/uu//iulUilJ8n//7//Ne97znjzlKU/JW9/61nR0dOSBBx7IxRdfnH/6p3/KhRdemOc+97lD7l+tVnPFFVfkqU99an7wgx/k3e9+d5qamrZYz/HHH59f/OIXufHGG/PCF75wk/N/+tOfcscdd+RTn/pUkuRb3/pWdttttyJ+K7bJBRdckNbW1kLv+Ytf/CI33HDDqD9nW1xxxRW5/vrr86EPfShPecpT0tHRscVrL7jgguyyyy5JHu6LBx98MF/4whfy+te/Pt/97nez9957b/G1j+zR0bbLLrvkggsu2Oy5PfbYo9BnLV++PBdffHE+8YlPFHpfAGD7J3gEAMa91tbWPOMZzxhy7OCDD053d3c+97nP5fe//32e8Yxn5L777ssZZ5yRww47LOeff34mTZo0eP1RRx2VV73qVTnjjDNy3XXXpaGhYfDcz3/+8zzwwAM577zz8trXvjY//OEP88pXvnKL9SxcuDBtbW258sorNxs8fv/7309ra2te9KIXJckmtY+Vfffdd7t6znCtWrUqSfLqV796MKDekn322WeTYHLffffNwoULc+mll+ZjH/vYFl87d+7cba51uBoaGsZNXwEAbImp1gDAhPX0pz89SfK3v/0tSXLJJZekv78/H/jAB4aEjknS1NSUM844I8cff/wm07O/973v5alPfWoOOuigPOtZz8q3vvWtrT538uTJOeaYY/LTn/4069atG3KuUqnkRz/6UV7ykpcMjpp89FTriy++OEcffXT222+/HHbYYfnIRz4yeJ/Ozs7Mmzcvl19++ZD7Pnra9IYNG3LRRRflmGOOyf77759nPOMZOeGEE/KrX/1qi3U/cgr05z//+S1O1d1Y62M94/LLL8+ZZ56ZJDnyyCMH7/3oqdZr167NJz7xibzwhS/Mfvvtl2OOOSbf/e53N6ntc5/7XD71qU/lOc95Tvbff/+88Y1v3Oy06Edav359vvCFLwz+fh511FG56KKLUq1Wkzw8/Xnj57P33ns/ringHR0d2WmnnQb77PLLL8++++6b73znO3nuc5+bQw45JHfeeecmU637+/tz/vnn58gjj8z++++fY445Jt///veH3Puaa67Jy1/+8uy333557nOfm//9v/93enp6Rlzjlgzn/tdcc0
1e/epX58ADD8zTn/70HH300fnmN7+Z5OF+PPLII5MkZ5555mAPbm5a+c0335x58+bl5ptv3urv0xPxeQMA44MRjwDAhHX33XcnSWbPnp0k+dnPfpZ9990306dP3+z1z372s/PsZz97yLFVq1bluuuuyzvf+c4kyXHHHZf3vOc9+dOf/pSnPe1pW3z28ccfn29+85v5yU9+kuOPP37w+I033piHHnpoiyMmf/jDH+bTn/50zjjjjMybNy933XVXPvWpT6W3t3dwavZwnHvuufmv//qvvOtd78q8efOybNmyfOELX8g73vGO/PSnP93qVPEkeeUrX5nDDjtsyLFzzjknf/nLX/LiF794WM844ogj8ta3vjUXXnhhLrjggsybN2+T5/T19eXVr351urq68va3vz2zZs3KNddck/e///158MEH85a3vGXw2q9//es56KCD8olPfCKrV6/OWWedlTPOOGOLQXCtVstb3vKW/O53v8upp56avffeOzfffHPOP//8LFmyJB//+Mfz4Q9/OF/96lfz3e9+N9/61rfypCc9adi/xxutXLkyK1euzO677z54bMOGDfnKV76Ss846KytXrsycOXM2ed273/3u3HDDDXnrW9+aAw44IDfccEPe+973pr6+Psccc0yuuuqqvPvd785LX/rSvPOd78z999+ff/u3f8udd96Zr371q485OnNgYGCTY5MmTRp83XDu/9Of/jSnnHJKXve61+Vf/uVf0tfXNziy8+lPf3r22WefXHDBBTn11FPz1re+NUcdddSIfu829/u0rZ83ADBxCB4BgHGvVqsNCVlWr16dW265JRdeeOHgKK3k4Y1O9tlnnxHd+6qrrkq1Ws0//uM/Jnl4SvbHPvaxXHbZZVvdaOZpT3ta9tlnn1x11VVDgscrrrgi8+bNy3777bfZ191yyy3p6OjIa17zmpTL5RxyyCFpbm7e7CY5W7N8+fKcdtppQ0adTZ48Of/yL/+S22+//TGn4e62225D1p382te+lt/+9re54IILBkO04TxjYxi3uSnKycOj3u64445cdtllOfDAA5Mkhx12WAYGBvLFL34xJ5xwQqZNm5YkmTp1ar74xS8Ojla977778vnPfz4rV67MTjvttMm9b7zxxvziF7/Ieeedl5e85CVJkuc+97lpbGzMZz/72bzuda/LU57ylMHPczhTk6vV6mCvrV+/Pvfcc0/OPffclMvl/PM///OQa9/ylrfkiCOO2Ox97rjjjvzkJz/J+973vrz+9a9P8nDwff/99+fmm2/OS17ykpx77rk57LDDcu655w6+bo899sgb3vCG3HDDDVu8d5Lcf//9mw3G3/Wud+XNb35zarXasO5/55135rjjjsv73//+wWsOPPDAPOtZz8rNN9+cAw44YPDv1O677/64ptE/8vdpuHUBANsHwSMAMO79+te/3iRkKZfLec5znpOPfexjgyOkJk2alA0bNozo3t/73vfyrGc9Kw0NDVmzZk2Sh6f9/vCHP8wZZ5yx1U1Sjj/++Jx99tlZtmxZpk+fnlWrVuX666/Pe97zni2+5tBDD823vvWtvPzlL88LX/jCPP/5z89LX/rSEY/y+sxnPpMkeeihh3LXXXfl3nvvzfXXX58kW91te3N+9rOf5Zxzzsnb3va2IWtWFvGMW265JbNmzRoMHTc69thj893vfje///3v8/znPz9Jst9++w2ZIr8xMOzt7d1s8HjLLbekrq4uRx999Cb3/uxnP5tbbrklT3nKU4ZV50YLFy7c5NisWbPy6U9/epMRnVsLuTfugv3oEYIbp30vXrw4DzzwQE4++eQhofrBBx+c1tbW3HTTTVsN4HbZZZdceOGFmxzf+Ht21113Dev+b3rTm5Ik3d3dufvuu3Pffffl1ltvTTLyPtqSR/4+DbcuAGD7IHgEAMa9pz3tafnoRz+aJCmVSpk8eXJmzJixSSg4c+bMwXX4NqdSqWT16tXZeeedkyR//vOfc9
tttyV5OPh4tCuvvDKvfvWrt3i/l770pTnnnHPy3//93znppJPyox/9KKVSKccee+wWX/PiF7841Wo1l156ab74xS/
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x1200 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.decomposition import PCA\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Применение PCA ТОЛЬКО к числовым данным\n",
|
|||
|
"pca = PCA(n_components=1)\n",
|
|||
|
"kc_pca = pca.fit_transform(df_numerical_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация\n",
|
|||
|
"plt.figure(figsize=(16, 12))\n",
|
|||
|
"plt.scatter(range(len(kc_pca)), kc_pca, alpha=0.6)\n",
|
|||
|
"plt.title(\"PCA Visualization of Price Feature\")\n",
|
|||
|
"plt.xlabel(\"Sample Index\")\n",
|
|||
|
"plt.ylabel(\"Principal Component 1\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Агломеративная (иерархическая) кластеризация"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABSoAAAP5CAYAAAAR1hEFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACd5klEQVR4nOzdd5hV9b3/7ffgDAKKDRUVKxq7IBbUkyiKxngieoImHmONJTbQxHIssXdjsCCoEY1RFGPDemKisURzEkWxJ2qMNYICKoIVGIb9/MHD/jEwUobRL8J9X5eXM2vt8tmFDfOaVWoqlUolAAAAAAAFtSo9AAAAAACAUAkAAAAAFCdUAgAAAADFCZUAAAAAQHFCJQAAAABQnFAJAAAAABQnVAIAAAAAxQmVAAAAAEBxQiUAAAAAUJxQCUCLOfHEE7POOus0+d+JJ55YejxgOuPHj8+mm26aF154IePHj8/hhx+ea6+9tvRYzAdeeOGFfO9738ukSZNKjzJfeOihh7Lbbrvl3XffzYgRI9KzZ8/8/e9/Lz3WfG/EiBFZZ511MmDAgNxxxx1ZZ511MmzYsHm+3Wm32VJmNec+++yT++67r8XuC4DZqy09AAALluWWWy4DBw5stKxv376FpgG+zJJLLpkDDjgge+yxRyqVStZZZ5388pe/LD0WhU2cODEnnHBC/ud//ietW7cuPc58oUePHrnpppuy3XbbJUl69+6dDTfcsPBU87+amprq/6f/el7dcsstWWGFFeb5dqaZ1Zy/+MUvctBBB2WLLbZIhw4dWuw+AfhyQiUALaahoSHt2rXLxhtv3Gi5H3Zh/tS3b9/sueee+fjjj7PaaqtlkUUWKT0Shd10002pra3NDjvsUHqU+UZtbW1+85vf5J133skiiyySlVZaqfRI3wjLLbdcamtr07Fjx2pYXHHFFef5dmf8N8a8mtWc66+/frp06ZIrr7wyp5xySoveLwBNs+s3AC1m8uTJadOmzRxddvjw4dlnn33StWvXdO/ePSeccELGjh1bXT9t96sRI0Y0ul7Pnj0b7UZeX1//pbubz3hbzz//fHr37p0uXbpkl112yR//+MdGt/3JJ5/k/PPPzw477JCNNtoovXr1yu233z7T/c94PyNGjMi+++6bE088Mb/+9a/zH//xH9l0001zxBFHZOTIkY2u/+CDD2avvfZKt27dsuGGG2annXbKkCFDquuHDRtWvd2nn3660XVvvPHGrLPOOunZs+dM88z4A9T48eOz4YYbzrSr3ezu/8vcdttt2W233bLxxhunS5cu+a//+q/84Q9/mOk5bmp3/y97ffbdd99G93Hfffdlt912S7du3fLtb387p512WsaPH19dP2DAgKyzzjrp1q3bTLukHnXUUTMdYmDixIm58MIL06NHj2y44YbZZZddZtqFr2fPnrnkkkty3nnnZfPNN88WW2yR448/PuPGjZvjxz+rQx7ccccd1dd0+tfhww8/zGabbdbka7nOOutk3XXXzeabb54jjzwyH330UfUyTe3yOO15ac5zmSTLLrtsOnfunL/97W+zPUzDjPf1+9//PptvvnkuuuiiJI3fvzP+N/3cr7zySvr27Zstt9wyG2ywQbbeeuucc845mTBhQvUykyZNyqWXXprtt98+Xbp0Sa9evXLnnXfO0XOeJO+++26OOeaYdO/ePV27ds3++++fl156qXr703b3/P3vf5/DDjssXbt2zbbbbpvLL788U6ZMafS6zPicHHPMMY1e00qlkv79+2frrbfOpptumsMOOyzvvfde9fINDQ0ZNGhQevXqlS5dumTjjTfOnnvumSeeeGKWr2My82s+4/eVSiV77rlno8/LE088sdF7K0luvvnm2e4yO2nSpPz2t79Nr169qsv23Xffmf6szviebmr2v/zlLzO9nz799NOcffbZ2XrrrbPxxhtn9913z5///OeZbnd275+JEyfm8ssvz0477ZSNNt
ooO+64YwYNGtToddt3330bXX+TTTbJgQcemHfeeWeub2fa419llVWy0kor5aKLLmr0XmvKjPc/498ZydTXad99983tt9+e7bbbLt26dcv++++fV155pXo7Tf1d+K9//SsbbLBBo9fl5Zdfzt57751u3bplhx12yM033/ylr1dTjy1Jxo4dmzPPPDPbbbddNtxww3Tv3j19+vRpdN8zXufcc8/NRhttlMcee2ym56B169bp3Llz1lprray77rpp165dVl555SRz9tl74oknZv/998/pp5+eTTbZJN///vfT0NAw0/thzJgxOeGEE7LVVlulW7du2WefffLss89W10+ZMiWDBg3Kd7/73Wy44Yb53ve+lxtuuGGO5kySXXbZJbfffnujf6MA8NWxRSUALeaLL77IkksuOdvLPfXUUznggAOy5ZZb5tJLL8348ePTv3//7Lfffrn99tvnOHYmU3/QTJIrr7wyyyyzTJKpUWnGwJgkhx56aPbZZ58cffTRuf322/Pzn/88V111VXr06JEJEyZkr732yocffpijjjoqnTp1yoMPPpiTTz45H3zwQQ477LDq7fTo0SNHHHFE9fvll18+ydTjmC299NI55ZRTMmXKlFx00UXZd9998/vf/z5t27bNn//85/Tp0yf77bdfjjzyyEyYMCE33XRTzjrrrGy44Ybp2rVr9TYXW2yxPPzww9l0002ry+677760ajXz7xgXW2yx/PnPf06lUqnurvbAAw+koaGh0eXm5v6nN2TIkJxzzjk58sgjs+mmm2b8+PG5+uqrc9xxx6Vbt26NdsEbOHBglltuuSSpvh5J8sMf/jA/+tGPqt+feeaZje7jiiuuyGWXXZa99torRx99dN555530798/zz33XG699dZG74mampo8/vjj6dGjR5Lks88+y6OPPtroualUKunTp0+eeeaZHHXUUVlzzTXzpz/9KUcffXQmTZqUH/zgB9XL3nTTTVlttdVy/vnnZ+zYsbnooovy9ttv5+abb05NTc1sH/8RRxyRPffcM8nULRTXX3/96vtj1VVXzb/+9a+ZntOLLroon3zySZZYYolGy6e9t+rr6/P666/nwgsvzLnnnpt+/fo1+do0ZW6ey2nq6+tz3nnnzfF9JMmECRNy1lln5eCDD84uu+zSaN1pp52WDTbYoPr9f//3f1e/HjNmTPbee+9svPHGueCCC9K6des89thj+e1vf5vll18+hxxySJLkuOOOy6OPPprDDz88Xbt2zaOPPpoTTzwxdXV1s33Ox44dmz333DNt27bNqaeemrZt2+b666/P3nvvndtvvz1rrrlmdZ4zzjgjPXr0yIABA/L0009n4MCB+fzzz/M///M/TT7u4cOH5/e//32jZdddd12uuuqqHH/88VljjTVywQUX5Gc/+1luvfXWJEm/fv3yu9/9Lscee2zWWWedjB49Opdffnl+9rOf5c9//nPatm07V8/99O6+++5GUaYp48ePz6WXXjrb2xo2bFhGjx6dHXfcsdnzJE2/nxoaGnLggQfmrbfeylFHHZXOnTvnzjvvTJ8+fXL99ddns802q152Vu+fSqWSww47LM8991z69u2bddddN8OGDcull16ad955J2effXb1suuvv35OP/30TJ48OSNGjMhFF12U448/Pr/73e/m6nam9+9//zvXXXfdHD0P0+5/mj//+c+58sorG13m5ZdfzhtvvJFjjjkmSy65ZC677LLqcRGn/d0yo3PPPTeTJ0+ufv/FF1/kpz/9aTp16pQBAwbkmWeeyemnn56VVlop22yzzRzNWqlUcuihh2b8+PE57rjjsuyyy+af//xnLr300px++un5zW9+M9N1Xnjhhfzud79L//79061btyZv9957761+PeP7dHafvcnUP2+LLrpoLr/88nz++eczbfX92Wef5cc//nEaGhryP//zP+nYsWOuvfbaHHjggbnzzjuz+uqr54wzzsgdd9yRQw89NN26dc
tTTz2V8847Lx9//HH69Okz2zl79uyZhoaG/OlPf2r0XgTgqyFUAtBixo0b96U/WE3voosuyhprrJGrrrqq+kNH165
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x1200 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[1 1 1 ... 1 2 1]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Построение дендрограммы (только для числовых данных)\n",
|
|||
|
"linkage_matrix = linkage(df_numerical_scaled, method='ward')\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(16, 12))\n",
|
|||
|
"dendrogram(linkage_matrix)\n",
|
|||
|
"plt.title('Дендрограмма агломеративной кластеризации (числовой признак \"price\")')\n",
|
|||
|
"plt.xlabel('Индекс образца')\n",
|
|||
|
"plt.ylabel('Расстояние')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Получение результатов кластеризации (только для числовых данных)\n",
|
|||
|
"result = fcluster(linkage_matrix, t=100, criterion='distance') \n",
|
|||
|
"print(result) # Вывод результатов кластеризации (номера кластеров для каждого образца)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABJ8AAAMQCAYAAACJzMTyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACbM0lEQVR4nOzdeZxVdf0/8NesgMomIqiouOLCIiYulaVoZm5f/drivuVXzSVzyfKXpWaZFe6oaWYu6dclzeWbZblX7vuKpoIJygAiIAjMMNzfHzQTwz4wh5nB5/Px4MG955x7zvuce865577mcz63rFQqlQIAAAAABShv7QIAAAAAWHEJnwAAAAAojPAJAAAAgMIInwAAAAAojPAJAAAAgMIInwAAAAAojPAJAAAAgMIInwAAAAAojPAJAAAAgMIInwBYZgcffHAOPvjgBY6766670q9fv3z/+99fzlUBS+uYY47Jbbfd1tpl0EJ+//vf56ijjmrtMgD4FBM+AVCYCRMm5Nxzz23tMoBmuOOOO1JTU5N99923tUuhhey7774ZP358fv/737d2KQB8SgmfACjMj3/843zyySdZaaWVWrsUYAnMmDEjw4YNyzHHHJPycpeJK4qysrIcffTRueCCCzJjxozWLgeATyFXFQAU4i9/+Uvuu+++HHfccenevXuTcbNnz85VV12VL33pS+nfv3++/OUv54YbbmgyzcEHH5zvf//7+dWvfpXPfvaz+cxnPpNjjz02Y8aMaTLd/fffnwMOOCCDBw9O//79s+uuu+bGG29sHP/kk0+mX79++fvf/54DDzwwAwcOzC677JKbbrqpcZqf/exn6devX5544onGYXfccUf69euXO++8s7GeeW8tPP/889OvX7/ccccdSZJ+/frl0ksvbTLNpZdemn79+s1X83//939nwIAB+dznPpef/OQn+eSTT5pM88ILL+SII47IlltumW233TYnn3xyampqmqzTk08+mSR58803s/POO2e//fZb4u2SJFdddVV22mmnbLbZZunXr1/jv3nXYW7f//73M3To0Mbn11xzTQYPHpybb765yXZb0L+G7ZQkTz/9dL75zW9myJAh6d+/f4YOHZpLL700s2fPbpxm6tSpOeecc7L99ttniy22yL777puHH3648f1Y2HLm3i5HH310ttxyy2y55ZY57rjj8t577zXOf0n2jWT+97VUKmW//fZLv379Mnr06CTJzJkzc9ZZZ2W77bbLNttsk1NPPTWTJ09ufM2MGTNy/vnnZ5dddkn//v2z5ZZb5vDDD8/rr7++0G2bJKNHj26y7eZ93rDsnXbaqcl+9s9//rNx+867fRbl9ttvz8yZM7Pjjjs2GX7hhRcucFvPu6/cdttt2X333dO/f//ssMMOufTSS1NfX9+sdUySf/zjHwtc3tzH4Pe///0cfPDB+f3vf58dd9wxgwcPzqGHHpoRI0Y0mf+oUaPy7W9/O5/73OeyxRZb5OCDD86zzz473/Ln/tdQY79+/fK73/0u3/ve9zJ48OB89rOfzU9/+tPMnDmz8fX19fW56qqrsscee2TgwIHZYostst9++zU5nzScBwYPHpza2tom9X37299ucmvy3PXcfffdTaZ96KGH5nsfl2T5SbLjjjtm5syZuf3227Mwizp+536vx40bl9NPPz1f/OIXM3DgwHz1q1/NAw88sND5zj3vhmOmwdChQ5vclr2knw/zno/nPS8miz8HALD8VLZ2AQCseCZNmpSzzz47m2++eY488sjceuutTcafddZZueOOO3L00Udn8ODBefrpp3PuuedmypQpOe644xqne+CBB9K9e/ecccYZmT17ds4///wcfPDB+eMf/5hOnTrl4YcfznHHHZdDDjkkJ5xwQmbMmJGbbropP/7xj9O/f/8MGjSocV4nnXRS9t577xxzzDF54IEHcvbZZydJDjjggJx00kl5+OGHc+aZZ+aee+7JhAkT8t
Of/jRf+cpXsvfeey9wHf/1r3/l2muvbfa2ueeee3Lqqadmzz33zHe+852MGTMmF154Yd5666389re/TVlZWV577bUcdNBBGTRoUH7xi1+kvr4+559/fr75zW82hmFz++Uvf5n+/fvnW9/6VpIs0Xa58847c/755+foo4/Odtttl06dOiVJvvGNbyzxutTU1OSCCy7Ij3/843zxi19sMm748OHp2bNnkmT8+PE5/vjjG8eNGDEihx12WHbddddceOGFKZVKueeeezJ8+PCsv/762X333VNfX58jjjiiMThYf/3184c//CHHHXdcrrvuupx55pmZOnVqY81f/epX87WvfS1JsuGGG2bkyJHZb7/9sv766+fnP/95Zs2alSuuuCL7779/7rrrrvTo0aOxnkXtGwty11135fnnn5/vPbjzzjvzwx/+MF26dMnZZ5+ds846KxdeeGGS5LTTTsszzzyTk08+Oeuss07efffdXHzxxTnllFPyxz/+MWVlZUu83ed19dVXz/eF/lvf+laqq6tzzjnnZPXVV095eXluu+22xd52dffdd2eHHXZIdXV1k+EzZszI0KFDc/TRRzcOm3dfufLKK3PhhRfmoIMOyumnn57XX389l156aT744INm3347Y8aM9O7dOxdffHHjsIb3ZW6vv/563nnnnZx88snp2rVrLrnkkhx00EG59957s/rqq+ett97K17/+9fTt2zdnnHFGqqqqcv311+fQQw/NNddck6233rrJNtthhx2SpMn6X3zxxRk0aFAuuuiivP3227nooosyfvz4XHTRRUmSYcOG5X//939zyimnpF+/fqmpqclll12WE088MQ8//HDjsZXMaYH0+OOPNx4v06ZNyyOPPLLAVmYrr7xyHnzwwey1116Nw+69996Ul5c3CWmXdPkdOnTIjjvumHvuuScHHnjgIrf/3Mdv0vS9njBhQr761a+mQ4cOOemkk9K9e/fccccdOe644/KLX/yiSb1LY0k/HxanOecAAIonfAKgxZ177rmZPHlyfvOb36SysulHzciRI3Prrbfm5JNPbuwA9/Of/3zKyspy5ZVX5oADDmhsKTV9+vTccccdWXvttZMk66+/fvbZZ5/ceeed2X///fPWW29ln332yQ9+8IPG+Q8ePDjbbLNNnnzyySbh05e+9KXG6bbffvuMGzcul19+efbff/907Ngx5513Xg444IBcddVVee6557LKKqss8Mvu3Ou40UYb5dVXX20cVl5enlmzZi30NaVSKcOGDcv222+fYcOGNQ7v27dvDjvssDzyyCPZYYcd8qtf/SrdunXLNddckw4dOiRJVl999Zxyyin55z//2WSe7777bv7+97/n7rvvzkYbbZQkS7RdXnrppXTr1i0nn3zyQutdnJtvvjmbbLJJ/vu//3u+cZtuumn69OmTJPMFIyNGjMhnP/vZ/PKXv2z80v25z30uDz74YJ588snsvvvuefTRR/Piiy/msssuy84775wk2XbbbfPee+/liSeeaBJmJUnv3r2zxRZbND4/88wz06lTp1x77bVZZZVVkiTbbbdddt5551x99dX53ve+1zjtovaNeUOhadOmZdiwYdl8882bvPelUimnnXZaYz9Jzz33XGOH3bW1tZk2bVrOOOOM7LbbbkmSrbfeOlOnTs15552XCRMmNPmi3xwffPBBfv3rXzepZ+LEiXnvvffywx/+MLvuumvjtH/7298WOa+pU6fm5Zdfzle+8pX5xk2fPj1rrrlmk208t48//jiXX355vvGNb+SMM85IMue47tatW84444wcfvjhjfvnkpg+fXq6dOnSZHkN7+O8y/3Vr36VrbbaKkkycODA7Lzzzrn++utz6qmnZvjw4amurs7111/f+Poddtghe+yxR37xi180CePWWWedBa7fqquuml/96leprKzMF7/4xZSXl+dnP/tZTjjhhGywwQYZN25cTjrppCYtcTp06JATTjghb7zxRpN5fu
ELX8gDDzzQGD49+OCD6dmzZ5Mwae5p//a3v6W2tjbV1dWZOXNmHnjggQwZMqRJ657mLH/AgAG59957M3Xq1AVuzwZ
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x800 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Hierarchical clustering on price: standardize, build a Ward linkage, draw the dendrogram\n",
|
|||
|
"features = df_encoded[['price']]\n",
|
|||
|
"\n",
|
|||
|
"# NOTE(review): `scaler` is not defined in this cell; assumed to be a StandardScaler from an earlier cell - confirm\n",
|
|||
|
"scaled_features = scaler.fit_transform(features)\n",
|
|||
|
"\n",
|
|||
|
"# Ward linkage over the standardized prices\n",
|
|||
|
"linkage_matrix = linkage(scaled_features, method='ward')\n",
|
|||
|
"\n",
|
|||
|
"# Take leaf labels from the frame that was actually clustered (not from `df`), so the\n",
|
|||
|
"# number of labels always equals the number of observations in the linkage\n",
|
|||
|
"leaf_labels = features.index.to_numpy()\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(12, 8))\n",
|
|||
|
"dendrogram(linkage_matrix, labels=leaf_labels, leaf_rotation=90, leaf_font_size=10)\n",
|
|||
|
"plt.title('Иерархическая кластеризация (дендрограмма) по цене')\n",
|
|||
|
"plt.xlabel('Индекс товара')\n",
|
|||
|
"plt.ylabel('Евклидово расстояние')\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"**Визуализация распределения кластеров**"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 20,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAACbQAAAuoCAYAAAAwk66tAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzde5RXZaH/8c/IHQEBEUGJULMBRAQRBANBMjNFraRfWSIaiBc85A2VIiXFCsFIEFS85V1OXtC8YmVapnjJu6JmYKDcFEVBZBDm94eLOY5cRIWZ3Tmv11qzFrO/ez/f57u/s9bjcr3Xs0vKy8vLAwAAAAAAAAAAANVsi+qeAAAAAAAAAAAAACSCNgAAAAAAAAAAAApC0AYAAAAAAAAAAEAhCNoAAAAAAAAAAAAoBEEbAAAAAAAAAAAAhSBoAwAAAAAAAAAAoBAEbQAAAAAAAAAAABSCoA0AAAAAAAAAAIBCELQBAAAAQBUrLy+v7in8r+J+AgAAAPzvIWgDAAAAYL0GDBiQ0tLSSj8dOnRInz598otf/CJLlizZ4PVz585NaWlpbrnlliqa8ebx8MMP54QTTkivXr2y22675Zvf/GbGjBmTt9566zOP9cQTT2TIkCGbYZbVZ9asWRk1alT23XffdOzYMX369MnJJ5+cmTNnVjqvb9++OeOMMzbpe7/yyis57LDDNumYAAAAAFSfmtU9AQAAAACKrX379jnrrLMqfl+5cmWef/75/OY3v8mLL76YG264ISUlJeu8tnnz5pk6dWpat25dVdPd5MaNG5fLLrss+++/f372s5+lcePGeemll3LppZdm+vTpufbaa9OyZcuNHu/3v/99Xn311c0446o1ffr0nHbaadl5551z3HHHpVWrVpk/f36uuuqq/L//9/9y0UUX5Wtf+9pme/977rknTz755GYbHwAAAICqJWgDAAAAYIMaNGiQTp06VTrWtWvXLFu2LBMmTMjTTz+91utr1K5de72v/Se48847c+mll2bEiBE58sgjK4537949vXv3zne+852ce+65ufDCC6tvktXo3//+d04//fT06tUrv/3tb1OjRo2K1/bbb78cdthhOf300/PnP/85tWvXrsaZAgAAAPCfwiNHAQAAAPhcOnTokCR54403knz0eNJTTz01w4YNS6dOnXLUUUet85Gj//rXv3LCCSekW7du6dq1a4455phKO5atWLEi5513Xnr37p0OHTrkoIMOyl133bXeeaxYsSJdunTJmDFjKh3/8MMP071794wePTpJ8txzz2XgwIHp0qVLOnfunCOPPDJPPfXUBj/jlClT8pWvfCUDBw5c67U2bdpk+PDh6dy5c8rLy5Mkixcvzi9+8Yvss88+6dChQ7p165ahQ4dm7ty5SZIzzjgjt956a15//fVK92VjPvPKlSszbty47L333unYsWMGDRqUadOmpbS0tGL8JHnooYfywx/+MF26dMmee+6ZU045JfPmzat4/ZZbbkn79u3z+9//Pl/72tfSrVu3XHfddSktLc2sWbMqvedtt92Wdu3aVbr+46655pqUlZVl5MiRlWK2JKlXr15OP/30HHrooet8NO2MGTNSWlqaGTNmVDo+YMCADBgwoOL3DX1vEydOrIgJS0tLM3HixCTJ6tWrM2XKlHzjG99Ihw4d8s1vfjPXXHPNWu/zyb/XJLnjjjty8MEHp2PHjunevXtOPfXULFiwYJ2fHwAAAIBNT9AGAAAAwOeyJn760pe+VHHs7rvvzpZbbpmLLroogwcPXuuaBQsW5Pvf/35mz56dUaNGZezYsXnzzTczcODAvPPOOykvL8/QoUNz44035qijjspFF12Uzp0756STTsq0adPWOY86derkm9/8Zu6+++6KsCz5KOx6++23c8ghh2Tp0qUZPHhwmjRpkokTJ2b8+PFZvnx5Bg0alPfee2+d4y5atCgzZ85Mnz591vtI1R/+8IcZNGhQSkpKUl
5enmOOOSYPPfRQTj311Fx++eU54YQT8vDDD1c8svX4449P7969s80222Tq1Knp06fPRn/mM888M1dddVUOP/zwTJo0Kc2aNcvPf/7zSvOZNm1afvzjH6dly5b5zW9+kxEjRuTJJ5/M97///bz11lsV561atSpXXHFFzj333IwYMSL9+vVLnTp1ctttt601Xo8ePdb7SNW//vWvad++fbbddtt1vt6jR4+cdNJJ2Wabbdb5+qf5tO/te9/7Xvr3758kmTp1ar73ve8lSUaNGpUJEybk4IMPzsUXX5z9998/v/zlLzNp0qRK43/y7/WJJ57Iaaedlv32269iZ75HHnkkp5xyyueaPwAAAACfnUeOAgAAALBB5eXl+fDDDyt+X7JkSR599NGK8GrNTm1JUqtWrfziF7+oeLzkx3cOS5Lf/e53KSsry5VXXlkRObVt2zaHHXZYnn766dSsWTN//etfM378+BxwwAFJkl69emX58uUZN25c+vXrl5o11/5fWoccckhuvvnmPPHEE9ljjz2SfPS40B133DG77rprnnrqqbz99ts54ogjsvvuuydJdtxxx0ydOjXLli1Lw4YN1xpzza5krVq12qj7tHDhwopdydbMYc8998y///3vTJ06NUnSunXrNG3atNKjWB966KFP/cxvvPFGbr311px++ukVO4n16tUrb775Zv72t78l+WhXsnHjxqVnz545//zzK+a1++6754ADDsjll1+e0047reL4sccemz59+lT8/o1vfCO33357fvKTn6SkpCTz58/PI488krFjx673M8+fPz/t2rXbqPvzefzzn//c4PfWokWLtGjRIkkq7uesWbPy3//93zn55JMzZMiQJEnPnj1TUlKSSy65JD/84Q/TpEmTJGv/vU6ZMiV169bNkCFDKo41btw4zz77bMrLy9cbNgIAAACw6dihDQAAAIANeuyxx7LLLrtU/Oy11145+eST06FDh5x//vmVIp8dd9yxIgRalyeeeCKdOnWqtGNXixYtcv/996d37955+OGHU1JSkt69e+fDDz+s+Onbt28WLVqUV155ZZ3jduvWLdttt13uvPPOJB89wvOPf/xjDjnkkCTJzjvvnKZNm+bYY4/NmWeemfvuuy/NmjXL8OHDK4KoT1oTzq1evXqj7tO2226bq6++Ol26dMncuXPz0EMP5Zprrsk//vGPlJWVrfe6jfnMM2bMSHl5efbff/9K1/br16/i37NmzcqiRYsqHUs+iug6d+6cRx99tNLxT4Zo/fv3z+uvv57HH388yUe7s2255Zb5xje+sd6516hRI6tWrdrwjfkCPs/39sgjj6S8vDx9+/Zd636uWLEiTzzxRMW5n/x77dq1a5YvX55+/frl/PPPz+OPP56ePXvmhBNOELMBAAAAVBE7tAEAAACwQbvsskt+8YtfJElKSkpSp06dtGzZMg0aNFjr3C233HKDY73zzjsb3PFszWNH1+zG9UkLFy5c545gJSUlOeigg/L73/8+I0eOzP3335/3338/Bx10UMW8rrvuulx00UW5++67M3Xq1NStWzeHHHJIRo4cuc4Ir2XLlikpKcnrr7++3vkuWbIkNWvWrPjct99+e37zm99k3rx5ady4cdq1a5e6det+6j35tM+8ePHiJMnWW29d6bWP//7OO+8kSZo1a7bWGM2aNcsLL7xQ6Vj9+vUr/d69e/e0atUq06ZNS9euXTNt2rQccMABqVOnznrnvt122+WNN95Y7+srV67MkiVL1jmnjfF5vrc19+HAAw9c55gLFiyoNP7Hde7cOVOmTMnvfve7XHnllZkyZUqaNWuWY489NgMGDPhcnwEAAACAz0bQBgAAAMAGbbnlltl11103yVgNGzasiLM+7uGHH06rVq3SsGHD1K9fP1dfffU6r//yl7+83rEPOeSQXHLJJZkxY0buuuuudO3aNdtvv33F6zvuuGPGjh2bVatW5Zlnnsltt92WG264Ia1bt87gwY
PXGq9JkybZZZdd8te//jXDhw9f5w5dF154YW688cbcf//9mT17dk4//fQMGDAggwYNyrbbbpskOe+88yrtCraue/J
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 2500x3000 with 4 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# One-hot encode the categorical columns (drop_first=True drops the first level of each)\n",
|
|||
|
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# Cluster on price only\n",
|
|||
|
"features = df_encoded[['price']]\n",
|
|||
|
"\n",
|
|||
|
"# Standardize the feature\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"scaled_features = scaler.fit_transform(features)\n",
|
|||
|
"\n",
|
|||
|
"# KMeans with a fixed seed, so cluster ids (and plot colours) are reproducible across re-runs\n",
|
|||
|
"kmeans = KMeans(n_clusters=3, random_state=42)\n",
|
|||
|
"df_encoded['Cluster'] = kmeans.fit_predict(scaled_features)\n",
|
|||
|
"\n",
|
|||
|
"# Dummy columns that get_dummies produced for each categorical variable\n",
|
|||
|
"category_columns = [col for col in df_encoded.columns if col.startswith('category_')]\n",
|
|||
|
"sub_category_columns = [col for col in df_encoded.columns if col.startswith('sub_category_')]\n",
|
|||
|
"\n",
|
|||
|
"# One entry per stacked panel:\n",
|
|||
|
"# (dummy columns, position of the column to plot, title, y-label prefix, message if the column is missing)\n",
|
|||
|
"panels = [\n",
|
|||
|
"    (category_columns, 0, 'Price vs Category Clusters', 'Category', 'No category columns found'),\n",
|
|||
|
"    (sub_category_columns, 0, 'Price vs Sub-Category Clusters', 'Sub-Category', 'No sub-category columns found'),\n",
|
|||
|
"    (category_columns, 1, 'Price vs Category Clusters', 'Category', 'Not enough category columns found'),\n",
|
|||
|
"    (sub_category_columns, 1, 'Price vs Sub-Category Clusters', 'Sub-Category', 'Not enough sub-category columns found'),\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Draw the four panels in a single column\n",
|
|||
|
"plt.figure(figsize=(25, 30))\n",
|
|||
|
"for position, (columns, col_idx, title, label, fallback) in enumerate(panels, start=1):\n",
|
|||
|
"    plt.subplot(4, 1, position)\n",
|
|||
|
"    if len(columns) > col_idx:\n",
|
|||
|
"        column = columns[col_idx]\n",
|
|||
|
"        sns.scatterplot(x=df_encoded['price'], y=df_encoded[column], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
|
|||
|
"        plt.xlabel('Price')\n",
|
|||
|
"        plt.ylabel(f'{label} ({column})')\n",
|
|||
|
"    else:\n",
|
|||
|
"        # Nothing to plot for this panel: show the explanatory message instead\n",
|
|||
|
"        plt.text(0.5, 0.5, fallback, ha='center', va='center', fontsize=12)\n",
|
|||
|
"    plt.title(title)\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## KMeans (неиерархическая кластеризация) для сравнения"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Центры кластеров:\n",
|
|||
|
" [[ 194.76055021]\n",
|
|||
|
" [ 696.35470625]\n",
|
|||
|
" [1847.9773913 ]\n",
|
|||
|
" [5430.2962963 ]]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABi8AAASgCAYAAACAO9vxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3yN9///8efJTiSxJXaMir1KbGIr2lpttShVm9qjOlTtqhmbRtUqRa1SVK1+tEI60KKlRikROyJTcn5/+OV8RWIdOec65XG/3T63T3Jd73Ndz+udS3quvM77/TaZzWazAAAAAAAAAAAAHIST0QEAAAAAAAAAAADuRvECAAAAAAAAAAA4FIoXAAAAAAAAAADAoVC8AAAAAAAAAAAADoXiBQAAAAAAAAAAcCgULwAAAAAAAAAAgEOheAEAAAAAAAAAABwKxQsAAAAAAAAAAOBQKF4AgI2YzWajI8CBcX84Nn4+AADgWcR7IPxXcK8CzwaKF8BTpEOHDurQoUOa7dHR0Xr11VdVunRpbd++3dI2MDBQbdu2ve/xBgwYoMDAQL377rs2y2wr8fHxWrRokVq3bq3nn39eQUFBatu2rdatW5fqTc6MGTMUGBiYoedOSEjQuHHjtHHjxgw53v1+rvZw7tw5BQYGpvpf8eLFVaFCBbVq1UqrV69+6DFs0cf29qj306OaPXu2QkNDbZDUOD169NCqVask/d/PfNGiRem2fffdd1WvXr0nPue7776b5t4sX768XnzxRc2cOVNxcXGPfcyIiAh169ZN//7772O9bvr06Ro5cuRjnw8AgGcdzzD/h2eYjMEzjHT79m0tWrRILVu2VPny5VWhQgW1bNlSCxcuVEJCwmMfLzAwUDNmzLA6T3JyslatWqV27dqpSpUqqlixolq2bKklS5ZYlWfVqlX65JNPrM4D4L/DxegAAGwrOjpaXbp00bFjxzRr1izVqVPHss/JyUm//fabIiIi5O/vn+p1MTEx2rlzp73jZojLly+rS5cuunDhgjp06KCyZcsqOTlZO3fu1Lvvvqvw8HCNHj1aJpPJJuePjIzUF198ofHjx2fI8T766KMMOc6T6Nmzp4KDgyXd+YTLrVu3tGrVKr3//vu6ffv2Ax8gX3nlFdWqVctOSTOeLe6n6dOnq0+fPjZMbV9ff/21Ll68qNatW6faPnXqVNWtW1cFCxa02blz5sypmTNnSrrzUHTz5k2Fh4dr3rx5+t///qcvvvhC7u7uj3y8H3/8Ubt3737sHN26dVPjxo3VuHFjVatW7bFfDwAA/g/PMDzDZIRn+Rnmww8/1LZt29StWzeVLl1aycnJCg8P17Rp0/Tzzz9r1qxZdssSGxurHj166ODBg3r99dfVpUsXubq6at++fZo4caL27NmjWbNmyc3N7ZGPOWfOHAUFBdkwNQBHQfECeIqlvOk/evSo5syZoxo1aqTaX7JkSZ04cUJbtmxRp06dUu3buXOnPD095evra8fEGWPYsGGKiIjQypUrFRAQYNkeHBysPHnyaMqUKapbt67q169vXMjHULRoUaMjqECBAipfvnyqbdWrV9exY8e0aNGiB77x9/f3T/Ng+V/ytN1PGS0uLk6TJk3SRx99JCen1AM63dzc9N5772np0qU2e9B2c3NLc2/WqVNH5cqVU+/evbVw4UL17NnTJue+m6enpzp27Kjx48drw4YNNj8fAABPK55hno73nDzDGOf8+fNau3atRo0apVdffdWyvVatWsqWLZvGjRunQ4cOqWzZsnbJM378eP3yyy9asmRJqp9HzZo1Vbx4cQ0aNEgrVqzQm2++aZc8AP5bmDYKeErdunVLXbt21Z9//qn58+enedMvSV5eXqpTp462bNmSZt/mzZvVuHFjubikrnEmJydr/vz5atiwoUqXLq3GjRtryZIlqdokJSVp/vz5at68ucqWLavy5curbdu22rdvn6XNjBkz1LBhQ+3atUsvvv
ii5Vjr1q1LdawvvvhCTZo0UZkyZVSrVi2NHDlS0dHR973uo0eP6n//+5/efvvtVG/6U3Tq1Ent2rWTl5dXuq+vV69emiHmX3/9tQIDA3Xu3DlJd/5YO3LkSNWuXVulS5dWkyZNLFMAnTt3zvJAMXz48FRT44SHh6t9+/YqV66cgoKCNGzYMF29ejXVeUqWLKlVq1apRo0aCgoK0okTJ9IMuQ4MDNSyZcv0/vvvKygoSBUqVFC/fv10+fLlVLlDQ0NVv359lS1bVm3bttWOHTsUGBiosLAwS9YnGf7r5OSkEiVK6Pz586mO9/nnn6tJkyYqV66c1qxZk+6Q63Xr1qlly5YqV66cgoODNXny5FTDhf/66y91795dFStWVMWKFdW7d2+dPXv2vlk2btyowMBA/fXXX6m2b9++XYGBgTpy5Igk+9xPBw4c0Ntvv63KlSurdOnSqlevnmbMmKHk5GRJsvTFzJkzU/XLo1zz33//ra5du6pixYqqXr26pk6dquHDh6e6P+Lj4zVr1izLdTZq1Ejz58+3nF+6M4x/8ODB6tu3r8qXL6+33npLrVu3TvcBrlOnTnrrrbfu20dr1qxRfHy86tatm2ZfyqcEFy9efN/Xp9i7d6/eeOMNPf/886pSpYoGDRqkCxcuPPR199OgQQOVL19eK1assGx72O+mr7/+WsOHD5ck1a9f3/K7IC4uTpMnT1ajRo1UunRpVaxYUW+99ZaOHj2a6pzNmzfX8ePHtWvXLqtzAwDwLOMZhmcYnmGe/Bnm8uXLMpvNqd7/p3jxxRc1cOBAS4Hv3vskRXr3VHR0tAYPHqwKFSqoWrVqGjNmjGJjY++bQ5KuXr2qNWvWqHXr1mkKSdKd98+dO3eWn5+fZduxY8fUp08fVa1aVaVKlVKtWrU0ZswYy5Sw9erV07///qu1a9emyn7+/HkNHDhQQUFBKleunDp27GjpwxSRkZEaMGCAgoKCVLlyZY0YMUJTp05Ndc8nJSVp2bJlevHFF1W2bFkFBwdr0qRJio+Pt7R599131bFjR3300UeqWLGimjZtqr59+6p27dpp+v39999X48aNH9hPAO6PkRfAUygmJkbdunXTkSNHFBoaqkqVKt23bdOmTdW/f/9Uw66jo6O1Z88eff7559qzZ0+q9iNHjtTXX3+t7t27q0KFCjpw4IDGjRunqKgo9e7dW5I0adIkffnllxo0aJACAwN18eJFzZo1S/369dOuXbvk6ekpSbp06ZJGjRqlnj17Km/evAoNDdWwYcNUpkwZFSlSRN98840+/fRTDRs2TIGBgTp58qQ++eQTxcbG3nd+yx9++EGS7jufvru7u0aMGPF4HXqPcePG6X//+5+GDRumHDlyaM+ePZo4caKyZMlimWu/T58+6tmzpxo1aiTpzh+033rrLVWtWlXTpk3TjRs3NH36dL355ptavXq1PDw8JN15o7Rw4UKNHTtW165dU5EiRdLNMHXqVDVs2FBTpkzR2bNnNX78eDk7O2vKlCmS7vxhfNasWXr77bdVtWpV/fDDD+rfv3+qY+TKlUsrV658ok8UnTp1SgUKFEi1bcaMGXr//ffl7e2tcuXKWdZBSLFs2TKNGjVKr7zyigYOHKizZ89q4sSJunHjhkaNGqVTp06pbdu2Kly4sD755BPdvn1bc+bM0euvv67169cre/bsaXI0aNBAXl5e2rRpk4oVK2bZ/s033+i5555TyZIl7XI/HTt2TJ06dVKTJk00depUmc1mbdy4UTNnzlThwoXVrFkzrVy5Uq+99pratGmjV155xdKPD7vmq1evqn379sqePbvGjx+vpKQkTZ8+XefPn7c8CJjNZvXo0UO//fab+vTpo+LFiyssLEzTpk3T2bNnNXr0aEvWb7/9Vi+99JLmzJmj5ORk/fPPPxo5cqTOnDljmebpwoULCgsL08SJE9O9fknasGGDgoOD0x3m3bp1a23ZssUyfdS990qKdevWadiwYWrevLm6d+
+ua9euKSQkRK+99prWrl2b7s/8UdSoUUOzZs3Sv//+q7x58z70d1NwcLB69uypOXPmpCouDR06VOHh4Ro4cKAKFCi
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x1200 with 4 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Load the data and keep the first 15000 rows\n",
|
|||
|
"df = pd.read_csv(\"..//static//csv//jio_mart_items.csv\").iloc[:15000]\n",
|
|||
|
"\n",
|
|||
|
"# Drop the columns that are not used in the analysis\n",
|
|||
|
"columns_to_drop = ['href', 'items']\n",
|
|||
|
"df_cleaned = df.drop(columns=columns_to_drop)\n",
|
|||
|
"\n",
|
|||
|
"# One-hot encode the categorical columns\n",
|
|||
|
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# Feature subset used for clustering\n",
|
|||
|
"features_used = ['price']\n",
|
|||
|
"data_to_scale = df_encoded[features_used]\n",
|
|||
|
"\n",
|
|||
|
"# Standardize\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"data_scaled = scaler.fit_transform(data_to_scale)\n",
|
|||
|
"\n",
|
|||
|
"# KMeans with a fixed seed\n",
|
|||
|
"random_state = 42\n",
|
|||
|
"kmeans = KMeans(n_clusters=4, random_state=random_state)\n",
|
|||
|
"labels = kmeans.fit_predict(data_scaled)\n",
|
|||
|
"centers = kmeans.cluster_centers_\n",
|
|||
|
"\n",
|
|||
|
"# Report the centroids on the original scale (undo the standardization)\n",
|
|||
|
"centers_original = scaler.inverse_transform(centers)\n",
|
|||
|
"print(\"Центры кластеров:\\n\", centers_original)\n",
|
|||
|
"\n",
|
|||
|
"# Dummy columns that get_dummies produced for each categorical variable\n",
|
|||
|
"category_columns = [col for col in df_encoded.columns if col.startswith('category_')]\n",
|
|||
|
"sub_category_columns = [col for col in df_encoded.columns if col.startswith('sub_category_')]\n",
|
|||
|
"\n",
|
|||
|
"# One entry per panel of the 2x2 grid:\n",
|
|||
|
"# (dummy columns, position of the column to plot, title, y-label used when the column is missing)\n",
|
|||
|
"panels = [\n",
|
|||
|
"    (category_columns, 0, 'KMeans Clustering: Price vs Category', 'Category'),\n",
|
|||
|
"    (sub_category_columns, 0, 'KMeans Clustering: Price vs Sub-Category', 'Sub-Category'),\n",
|
|||
|
"    (category_columns, 1, 'KMeans Clustering: Price vs Category', 'Category'),\n",
|
|||
|
"    (sub_category_columns, 1, 'KMeans Clustering: Price vs Sub-Category', 'Sub-Category'),\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Scatter plots of price against a dummy column, coloured by KMeans cluster\n",
|
|||
|
"plt.figure(figsize=(16, 12))\n",
|
|||
|
"for position, (columns, col_idx, title, label) in enumerate(panels, start=1):\n",
|
|||
|
"    plt.subplot(2, 2, position)\n",
|
|||
|
"    if len(columns) > col_idx:\n",
|
|||
|
"        column = columns[col_idx]\n",
|
|||
|
"        sns.scatterplot(x=df_cleaned['price'], y=df_encoded[column], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
"        plt.title(title)\n",
|
|||
|
"        plt.ylabel(f'{label} ({column})')\n",
|
|||
|
"    else:\n",
|
|||
|
"        # Column not available: leave the axes empty and say so in the title\n",
|
|||
|
"        plt.title(title + ' (No Data)')\n",
|
|||
|
"        plt.ylabel(label)\n",
|
|||
|
"    plt.xlabel('Price')\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### PCA для визуализации сокращенной размерности"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAAJICAYAAADPWa1BAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3yT5frH8W+SJm26Szd7I3vKUEFABbfgOE4EjgMExwFRRAVRD6gIiCKgKDgAfy4UcYHgFhFFQRyg7F1K927TJL8/OERKBw30aUL7eb9evmyfkefK1bTkyvXc921yu91uAQAAAAAAAAAA+BGzrwMAAAAAAAAAAAA4Hg0MAAAAAAAAAADgd2hgAAAAAAAAAAAAv0MDAwAAAAAAAAAA+B0aGAAAAAAAAAAAwO/QwAAAAAAAAAAAAH6HBgYAAAAAAAAAAPA7NDAAAAAAAAAAAIDfoYEBALWI2+32dQgoR23+2dTm5w4AAICSeG9YO/BzBlBZNDAAaMiQIWrVqlWJ/9q1a6e+ffvq0UcfVWZmZqlzdu7cqcmTJ+v8889Xhw4d1LdvX40dO1Zbtmwp9zrPPPOMWrVqpccff9zIp1Ou2bNnq1WrVj65dlnee+89tWrVSvv27TP8vKKiIk2dOlUffviht2F65brrrlOrVq20cuVKQ6/jbz/LU5GVlaX7779f69ev92wbMmSIhgwZUm0xVPb3uX///nrggQeq9Npbt27V9ddfXyWPtW/fPrVq1UrvvfdelTweAADwH9QsvlGTapZWrVpp9uzZpbb//fff6tWrl84991zt2rXLc2yrVq00c+bMMh/L5XKpd+/ep+17z0OHDmnatGm68MIL1bFjR51zzjkaOXJkiZpEMqYuSUpK0u233679+/dXyeOV93MFUHPQwAAgSWrTpo3eeustz3+vvPKKhg0bpqVLl2rEiBEl7o747LPPNHjwYP3xxx+644479NJLL2nMmDHatWuX/vWvf2nNmjWlHt/lcmnZsmVq2bKlPvjgA+Xn51fn06v1kpOT9dprr6m4uNiwa+zYsUMbNmxQy5Yt9eabbxp2nZpm8+bN+uCDD+RyuTzbHnnkET3yyCPVcv2T+X2uSitWrNCGDRuq5LHi4uL01ltvqW/fvlXyeAAAwL9Qs9Rs1VGzHG/r1q0aNmyY7Ha7Fi9erMaNG3v2mc1mrVixoszzfvrpJyUnJ1dTlFXr559/1hVXXKEvv/xSN998s1544QU99NBDKigo0JAhQ7Rs2TJDr//999/r66+/rrLHe+utt3TNNddU2eMB8D8Bvg4AgH8IDQ1Vp06dSmw788wzlZubq+eee06//vqrOnXqpD179mj8+PHq3bu3Zs2aJYvF4jl+wIABuv766zV+/Hh98cUXstlsnn3fffedkpKSNHPmTN1000366KOPeJNRw7z33nuqV6+eRowYoXHjxmn37t1q1KiRr8M6LTVv3rxarnOyv8/+ymazlfo7BgAAag5qFlSl7du3a+jQoQoJCdFrr72munXrltjfpUsXrV+/Xn/++afatGlTYt/HH3+s1q1ba/PmzdUZ8inLyMjQf/7zHzVu3FivvPKK7Ha7Z9/AgQN1++23a9KkSTrnnHMUExPjw0grj/f/QM3HCAwAFWrXrp0k6cCBA5KkRYsWqaioSA8//HCJQkCS7Ha7xo8fr6uuuqrUEO6lS5eqZcuW6tq1q3r06KG33nrrhNfu37+/pk6dqqFDh6pDhw566KGHJB150zVp0iSdddZZat++vf71r39p7dq1Jc4tLCzUE088obPPPludO3fWhAkTVFhYWOKYsobDrlu3Tq1atdK6des823bs2KE777xT3bt315lnnqkRI0Zo+/btJa41bdo0nXvuuWrXrp0uu+wyffLJJyUe1+Vyae7cuerbt686duyoUaNGlTnM/XiVPW/16tW64YYb1LlzZ7Vr104XXnihlixZIunItDrnnXeeJGnChAnq37+/57
x33nlHV155pTp16qQOHTroiiuu0KefflrisVu1anXCaYOcTqeWLVumfv366fzzz1dwcHCZP2OHw6Hp06erT58+6tChg2655RYtW7as1PDy999/XxdffLHat2+vyy+/XGvXrlWbNm0qHJ79ySef6Morr1Tnzp119tlna9KkSSVyNXv2bF144YVatWqVLr30UrVv315XXHGFNmzYoI0bN+qaa65Rhw4ddOmll5Z6Pf39998aMWKEunTpoi5dumj06NHau3evZ//R182bb76pfv36qUuXLp67+irK8bp163TzzTdLkm6++WbP6/HY1+a///1vXXnllaWe76hRo3T55Zd7vl+/fr1uuukmdezYUd27d9f48eOVlpZWbr6kk/99PvY5H/u7cnzskvT7779r6NCh6tq1qzp37qxhw4Zp48aNko78TJ5//nlJJYd+u1wuzZ8/XxdccIHatWungQMHatGiRaWuM27cON19993q1KmThg8fXmoKqffee09t2rTRr7/+qmuvvVbt27dXv379tGDBghKPlZycrDFjxnh+xydNmqRnnnmmxO8KAADwX9Qs1CyVqVmOtX37dt18880KCwvT4sWLSzUvpCPNsZiYmFKjMIqLi/XZZ5/pkksuKXVOZX7uaWlpevTRR9WvXz+1a9dO3bt31+jRo0vUQ0OGDNFDDz2k+fPnq2/fvmrfvr2uu+46bdq0yXNMQUGBJk+erD59+njyefz73OMtW7ZMycnJevDBB0s0L6QjI07GjRunG2+8UTk5OaXOLW+61gceeKDEz2vPnj0aOXKkevTooY4dO+raa6/1jLh47733NGHCBEnSeeedV+Jn9s477+iSSy7xTA03e/ZsOZ3OEtcZOnSoHnnkEXXp0kUXX3yxnE5niTri6O/G2rVr9e9//1sdO3bU2WefraeffrrEY+Xk5GjSpEnq1auXOnfurDFjxujVV1/1q+nbAPyDBgaACu3cuVOS1KBBA0nSt99+qzZt2ig+Pr7M43v16qUxY8YoNjbWsy0jI0NffPGFBg0aJEkaPHiwfvvtN/3xxx8nvP6SJUvUvn17zZ07V1dffbUKCws1dOhQff755xozZoyef/55JSQk6NZbby3xxvC+++7T22+/rREjRmjWrFnKzMzUq6++6vXzP3TokK699lrt2rVLkydP1tNPP62UlBQNHTpUGRkZcrvdGj16tN58800NHz5c8+bN87wBOnbo7dNPP605c+bo6quv1vPPP6/IyEjNmDHjhNevzHlfffWVRo8erbZt22ru3LmaPXu2GjRooMcee0y//vqr4uLiPB8S33HHHZ6vlyxZokmTJun888/Xiy++qOnTp8tms2ncuHFKSkryPP5bb72lUaNGVRjnN998o8OHD2vQoEEKCgrSRRddpPfff19FRUUljps0aZJee+013XTTTZozZ45iYmI0ceLEEscsW7ZMDzzwgLp06aK5c+dq4MCBGjVqVIk3nMebO3euxo4dq06dOum5557T6NGjtXLlSg0ZMkQFBQWe45KSkvTkk09q5MiRevbZZ5WVlaW7775bY8eO1TXXXKM5c+bI7XZrzJgxnvN27typ6667TqmpqXrqqac0ZcoU7d27V9dff71SU1NLxPH8889r/PjxmjRpkjp37nzCHLdt21aTJk3y5KasaaMuv/xy/fHHH9q9e7dnW1ZWlr755htdccUVko4MYR82bJiCgoI0a9YsPfjgg/rxxx918803l3j+xzuZ32dv5OTk6NZbb1VUVJRmz56tZ555Rvn5+brllluUnZ2ta665RldffbWkkkO/J0+erOeee06XX365XnjhBV144YWaOnWq5syZU+LxP/30U4WEhGjevHm69dZby4zB5XLpP//5jy6++GLNnz9fXbp00bRp0/Ttt99KOjLX8tChQ/XLL7/owQcf1BNPPKEtW7Zo4cKFJ/WcAQBA9aNmoWapTM1y1I4dOzR06FCFhoZq8eLF5b5OLBaLBg4cWKqBsX
btWhUWFpa62aUyP3e3260RI0ZozZo1GjdunBYsWKA777xTa9euLVULrFy5Up9//rkefvhhzZw5UykpKbrrrrs8ddH
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x600 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# PCA on the standardized numeric data only.\n",
|
|||
|
"# NOTE(review): `data_scaled` is built from the single column 'price' in the KMeans cell,\n",
|
|||
|
"# so this one-component PCA does not reduce dimensionality.\n",
|
|||
|
"pca = PCA(n_components=1)\n",
|
|||
|
"reduced_data = pca.fit_transform(data_scaled)\n",
|
|||
|
"\n",
|
|||
|
"sample_index = range(len(reduced_data))\n",
|
|||
|
"first_component = reduced_data[:, 0]\n",
|
|||
|
"\n",
|
|||
|
"# (cluster ids used for colouring, panel title)\n",
|
|||
|
"# NOTE(review): `result` is not defined in this cell; presumably the agglomerative labels - confirm\n",
|
|||
|
"panels = [\n",
|
|||
|
"    (result, 'PCA reduced data: Agglomerative Clustering'),\n",
|
|||
|
"    (labels, 'PCA reduced data: KMeans Clustering'),\n",
|
|||
|
"]\n",
|
|||
|
"\n",
|
|||
|
"# Side-by-side view of the projected data, one panel per clustering\n",
|
|||
|
"fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
|
|||
|
"for ax, (cluster_ids, title) in zip(axes, panels):\n",
|
|||
|
"    sns.scatterplot(x=sample_index, y=first_component, hue=cluster_ids, palette='Set1', alpha=0.6, ax=ax)\n",
|
|||
|
"    ax.set_title(title)\n",
|
|||
|
"    ax.set_xlabel('Sample Index')\n",
|
|||
|
"    ax.set_ylabel('Principal Component 1')\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Анализ инерции для метода локтя (метод оценки суммы квадратов расстояний)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2cAAAImCAYAAADXOPIYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB86UlEQVR4nO3dd1zVZf/H8fdhgwwFZako7gW4cJtmZkPrTm3dpZWpqVn+SrvtNm1ndSe5szQtG5YjzYbtnZo7J5iiYg6GgoDIhvP7Azl5ZIpwzgFez8eDB/Jd58PhKnl7fb7X12A0Go0CAAAAAFiVnbULAAAAAAAQzgAAAADAJhDOAAAAAMAGEM4AAAAAwAYQzgAAAADABhDOAAAAAMAGEM4AAAAAwAYQzgAAAADABhDOAAAAAMAGEM4AAAAAwAYQzgDAAkaOHKnWrVvr7rvvLvGYxx9/XK1bt9Z///tfC1YGoKJOnjyp1q1ba926ddYuBUANQTgDAAuxs7PT7t27FRcXV2Rfenq6fv75ZytUBQAAbAXhDAAspF27dnJ2dtY333xTZN/PP/8sV1dX+fn5WaEyAABgCwhnAGAhbm5u6tevX7Hh7KuvvtINN9wgBweHIvt++OEHDRs2TCEhIerdu7deeuklpaenS5IGDBig1q1bF/tx8uRJSdKmTZt0zz33qEuXLurevbumTJmi2NhYs9eYMmVKsdcoq12rsF2zuI9L7du3T6NHj1b37t3VuXNnjR8/XocPHzbt37p1q1q3bq2tW7dKkg4dOqSBAwfq7rvv1oIFC0p8jQULFkiS1qxZo5tuukkdOnQw219Wi+jq1auLve6l5xW2rpV1XEVrKO97U9rrl7S/8Ofw3//+VwMGDDB73ZUrV5q9h5e+zs6dO82O/fDDD9W6dWuza2RmZur111/XoEGD1KFDB3Xu3FmjRo1SVFSU2bkl1TVy5EizYwrrKM7l46PQyJEjza6TlZWlN954QzfeeKNCQkI0aNAgLVmyRPn5+WbnXF7L1q1by3VuWYxGo6ZNm6bQ0FBt3Lix3OcBQKGivwUAAKrMzTffrMcee0xxcXHy9/eXJKWlpem3337Tu+++q99++83s+C+++EJPPPGEbrnlFj322GM6deqU5syZo+joaL377rtauHChsrOzdebMGT3yyCOaMGGC+vfvL0ny9fXV+vXr9eSTT2rIkCEaN26czp07p/nz5+uuu+7Sp59+Kh8fH0kFv9TeddddGjZsmCSZrlce7dq107PPPmv6es2aNfrkk09MX2/ZskVjxoxR9+7d9fLLLysrK0uLFy/W3XffrdWrV6t58+ZFrjlr1ix16NBBEyZMkJeXl/r27StJev755yXJ9Hr+/v7avn27ZsyYodtvv10zZsxQnTp1JKlc9WdmZiokJEQzZswwbSvpvEvf28uPq2gNV/LePPPMM2rfvn2xr79q1SpJ0oEDB/TCCy8UOfZyKSkpmjt3brH76tSpo59++kldunQxbfvqq69kZ2f+77lTp07Vjh07NHnyZAUFBen48eOaN2+epkyZog0bNshgMJiOvf3223XHHXeYvi78OVYmo9Go8ePHa/fu3XrkkUfUpk0bbd26VXPnztWJEyf04osvmo69fMw2b9683OeW5qWXXtKXX36pN954Q3369Kn07xFAzUc4AwAL6t+/v1xdXfXNN9/ogQcekCR9//338vHxMftlWCr4ZTMiIkJ9+/ZVRESEaXvTpk31wAMP6NdffzWFhcJZsqCgIHXs2FGSlJ+fr4iICPXp00evv/666fzOnTvr5ptv1rJlyzR16lRJUkZGhpo2bWo6t/B65eHu7m46T5J+//13s/2vv/66mjRpoiVLlsje3l6S1KdPH11//fWaP3++5s2bZ3b88ePHtXHjRn3++edq2bKlJJmCrLu7uySZvd6GDRskSU899ZQpFEmSk5NTmbVnZGSofv36Ztcr6bxL39vLj9u7d2+FariS96ZFix
Ylvn7h9qysrGKPvdz8+fMVGBioc+fOFdl3zTXX6Mcff9R//vMfSVJcXJz+/PNPde3aVadOnZIkZWdn68KFC5oxY4ZuvvlmSVK3bt2UlpamV199VWfPnlWDBg1M1/T39zerp/DnWJl+++03bd68WbNnz9bgwYMlSb1795aLi4vmzZun++67zzSeLh+zv/76a7nPLcnrr7+uVatWaeHChbrmmmsq/fsDUDvQ1ggAFuTi4qIBAwaYtTZu2LBBN910k9lMgyQdPXpUcXFxGjBggHJzc00f4eHhcnd316ZNm0p9rWPHjunMmTMaMmSI2fagoCB16tRJ27ZtM22LjY2Vh4dHJXyH5tLT07Vv3z7ddNNNpvAhSZ6enrr22mvNaig8fs6cOerevXuZvwwXCg0NlSS98847SkhIUHZ2tnJzc8t1bmV93xWp4Urfm8py6NAhrVq1Sk8//XSx+wcMGKCYmBgdPXpUkvTNN98oLCxMDRs2NB3j5OSkZcuW6eabb1Z8fLy2bNmilStXmha1yc7OvuK68vPzlZubK6PRWOYxhR+XHrtt2zY5ODjoxhtvNDvn1ltvNe0vydWcK0krVqzQkiVLNHjwYLPZVQC4UsycAYCF3XTTTXrkkUcUFxcnZ2dn/fHHH3rssceKHJecnCypoAWsuDawhISEUl+n8Pz69esX2Ve/fn1FRkZKKpihO336tBo1anRl30g5nD9/XkajscQazp8/b7Zt/Pjx8vT0NGuLLEt4eLhmzJihJUuWaOHChVdU36lTp0pt/6vKGq70vaksL730kgYPHqxOnToVu9/Pz08dOnTQjz/+qGbNmumrr77SkCFDTOOl0O+//66XX35ZR48eVZ06ddSmTRu5ublJUqkBqySLFi3SokWLZG9vr/r166tPnz76v//7P7NFcgpnmy/VrVs3SQWtmvXq1TMLupJMM3ilvZ9Xc64kHTx4UH369NGXX36p+++/X+3atSv1eAAoCeEMACzsmmuuUZ06dfTNN9/Izc1NjRo1UocOHYoc5+npKang3p7CX0Av5eXlVerr1K1bV5J09uzZIvvOnDmjevXqSZKioqKUmZlZZBGPyuDh4SGDwVBiDYU1Fpo6daq++eYbTZo0SStWrCh3+9udd96pjRs3Kjc3V88884waNWqkCRMmlHpOfn6+9uzZo+HDh5frNS6f2bzaGq70vakMX3/9tfbv32/W5lqc6667Tj/++KNuuukm7d+/XwsXLjQLZ3///bcmTpyogQMHavHixWrcuLEMBoNWrFhRpK1VKvu9kwrevzvvvFP5+fk6ffq05syZo7Fjx+rzzz83HfP888+bhelL7xvz8vLSuXPnlJeXZxayCv8Ro3C8F+dqzpWk//u//9N9992nwYMHa8aMGVqzZk2RoAcA5UFbIwBYmJOTkwYOHKhvv/1WX3/9tekel8s1a9ZMPj4+OnnypEJCQkwffn5+ev3114vMZFwuODhYDRo00Jdffmm2/cSJE9q9e7c6d+4sSfrll1/Utm1beXt7X/H3kp+fX+ovoW5uburQoYO+/vpr5eXlmbafP39ev/zyS5H77Dp06KCFCxfq1KlTmjVrVrnrmDdvnn755Re9+uqruummmxQSElLm/V67du1Senq6unfvXupxhbNAly+IcbU1XOl7c7Wys7P12muvaeLEiWb3gxVn4MCB2rNnjz788EN16dJFvr6+Zvv379+vrKwsPfTQQwoKCjKFr8JgVvieFa50WNZ7JxUsYBMSEqKwsDDddNNNuvfee/XXX38pJSXFdExwcLDZfwuX3t/XrVs35ebmFlkNtTDclfZ+Xs25UsFMp4uLi5555hkdOHBA7777bpnfLwAUh5kzALCCm2++WePGjZOdnZ3ZSoGXsre31+OPP65nnnlG9vb2uvbaa5WamqpFixYpPj6+zHY8Ozs7TZ48WdOmTdOUKVN066236ty5c1q4cKG8vLw0atQoHThwQCtWrNDgwYO1e/du07lnzpyRVD
BDkpSUVCS4JSUlKTo6WsePHzeFvJJMmTJFo0eP1kMPPaR77rlHOTk5WrJkibKzszVx4sQix/v5+emxxx7TzJkzNXz
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Inertia of a KMeans fit for every candidate number of clusters\n",
|
|||
|
"inertias = []\n",
|
|||
|
"clusters_range = range(1, 11)\n",
|
|||
|
"\n",
|
|||
|
"for i in clusters_range:\n",
|
|||
|
"    # fit() returns the estimator, so `kmeans` is rebound to the fitted model on each pass;\n",
|
|||
|
"    # after the loop it holds the model for the last value in clusters_range\n",
|
|||
|
"    kmeans = KMeans(n_clusters=i, random_state=random_state).fit(data_scaled)\n",
|
|||
|
"    inertias.append(kmeans.inertia_)\n",
|
|||
|
"\n",
|
|||
|
"# Elbow plot: inertia against the number of clusters\n",
|
|||
|
"fig, ax = plt.subplots(figsize=(10, 6))\n",
|
|||
|
"ax.plot(clusters_range, inertias, marker='o')\n",
|
|||
|
"ax.set_title('Метод локтя для оптимального k')\n",
|
|||
|
"ax.set_xlabel('Количество кластеров')\n",
|
|||
|
"ax.set_ylabel('Инерция')\n",
|
|||
|
"ax.grid(True)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Расчет коэффициентов силуэта"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2MAAAImCAYAAADe01JiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACehUlEQVR4nOzdeVhUZf8G8HsYhn1H2URlUcENN8BcQMxSf1qpaFku5V6mkRuYReaWWm65pGWpqaWW4ZKvS+auaQguuSAoCgYiiyD7MsDM7w9ichxQBud4WO7PdXEJ5zznzPd8o/f17jnnORKlUqkEERERERERPVd6YhdARERERERUHzGMERERERERiYBhjIiIiIiISAQMY0RERERERCJgGCMiIiIiIhIBwxgREREREZEIGMaIiIiIiIhEwDBGREREREQkAoYxIiIiIiIiETCMERE9YuTIkRg5cqTatsjISLz22mto1aoVdu7cKejnr169Gh4eHlofV1HdRFSz8d9bItIXuwAioposPT0d7733Hlq3bo0NGzZUKygRERERVYRhjIjoCTZt2oTCwkJ8+eWXsLe3F7scIiIiqkN4myIRUSUePnyIbdu24dVXX9UIYvHx8QgKCkK3bt3Qvn17jBw5EhcuXFAbc+nSJQwfPhwdOnSAr68vpk+fjpSUFLUxv//+O/r06YO2bdti7NixSE1NBQDs3LkTPXr0QPv27TF16lRkZWWpjlEqlVi1ahW6du2KDh06YO7cuSguLkZxcTE+//xzeHt7o0uXLli5ciWUSqXqOA8PD6xevVrtPG+++SY8PDyQmJgIAPjoo4/w4osvqtWYmJgIDw8P7Nq1q8KfAaCoqAi9evXSmDk8cuQIAgMD0bZtW3Tr1g0LFixAfn6+an9lt2WW11r+WRV9ldf5tFu9Krqmx+Xm5mL+/Pnw8/ND+/btMXjwYJw4cUK1/8UXX8RHH32kdsy0adPg4eGB8PBwtXFPqvWLL76Al5cXcnJy1M61du1adOrUCQUFBSgqKsKyZcvQs2dPtGnTBq+88gr27NkDAAgPD6+0H+U9yMjIwNy5c1XH+/r6YtKkSap/xpUpLi6u9NyPGjly5FPH3bp1C2PHjoWPj88Tz/Wo8n/W+/fvx3vvvYd27dohICAAX3/9NRQKhWpcYWEhli1bht69e6NNmzbo2LEjRo8ejRs3bqjGPHz4ENOmTYOvr6/q966oqEi1//F/FwDN30Vtfq+uXbuG1q1bq/2OpKeno0uXLhg9erTav4dPcvr0abRp0waffPJJlY8hotqNM2NERI9RKpW4f/8+FixYgJKSErz77rtq+2NjY/HGG2/AxcUFoaGhkMlk2LJlC9555x1s3LgRvr6+uHPnDsaMGQNvb2+sWrUKycnJWLZsGUaOHIndu3fD1NQU165dw9SpU/F///d/+OSTT/DHH3/gl19+AQCsWbMGs2bNQkFBARYvXoxPPvkEa9asAQBs3rwZ69atw+TJk9G6dWt89913uHTpEgBAJpNh2bJluHr1Kr7++mtYW1vj7bffrvA69+7dqzruWX3//fcaf9nft28fZsyYgVdffRVTpkzBvXv3sGLFCsTGxmLTpk2QSCRPPa+dnR1+/vlnAMCJEyewbt06rFmzBg0bNoSBgYFOai8tLcWYMWNUAdvNzQ27d+/GpEmTsHnzZnh7e2scExkZif3791d4vh49euD9999X/bx27VrExsYCAIYMGYKNGzfi0KFDeP3111Vj9u7di379+sHY2BihoaHYv38/ZsyYgSZNmmDPnj2YOXMmCgsL8corr6j6sXPnTvz666+qn83MzKBUKvHuu+8iKysLM2bMQIMGDRATE4OvvvoKn332GTZs2FBpH8rDyrp162BjY6P2GY9r1aoVPvvsM9XPj4+bOHEiDAwMMH/+fNjZ2UFPT6/Scz1uzpw56NGjB1avXo0LFy5gzZo1yM/PR3BwMAAgJCQEkZGRmDZtGpo0aY
K7d+9i5cqVmD59Ovbv3w+JRIJZs2bh6tWrmDt3LuRyOebMmQMjIyPMmDHjqZ9fHW3atMH48eOxbt06DBgwAF26dMHs2bOhUCiwePHiKv2uR0REYPLkyXj11VexYMGCKh1DRLUfwxgR0WMiIiIQEBAAmUyG7777Di4uLmr716xZAwMDA2zZsgVmZmYAgICAALzyyiv48ssv8euvv+LQoUPQ19fHqlWrYGxsDABwdHTE2LFjsWvXLowcORLfffcdGjVqhCVLlkBPTw/+/v6IiIhAXFwcPv74Y/Tp0wcAIJFI8NFHH+H27dtwcXHB999/j4EDB2LSpEkAgJYtWyIgIAASiQQrVqxAgwYN0KNHD9y7dw/ff/89hg8fDqlUqnYNeXl5WLp0KVq3bo3r168/U7/u37+P7777Tu1cSqUSS5cuhZ+fH5YuXaoa6+LiglGjRuHkyZMICAh46rkNDAzQvn17AMCdO3dU1+vs7PxMNT/q1KlT+Pvvv/H111/jpZdeAgC88MILSEhIwF9//aURxhQKBRYsWFBp72xsbFQ1l/9czt3dHR06dMDevXtVYezixYuIj4/H4sWLkZ+fj4MHD+K9997D8OHDAQDdu3fHvXv3sHLlSgwZMkR17tOnTwOA2melpKTA2NgYM2fOVNXduXNn/PPPP6rQVpnyGcsOHTrA2tpa7TMeZ2Zmpva5j47LyMhAQkICPv30U/Tt27fCMU/SunVr1e+Mv78/8vPzsXnzZlXAy8vLQ2hoKPr16wcA8PX1RW5uLhYvXowHDx7A0tISZmZm+Oyzz9C7d28AwNGjR3H69GnBwhgATJo0CceOHcPcuXMxYcIEHDlyBCtXrqzS7c1XrlzBu+++iz59+uDzzz9nECOqR3ibIhHRY1q1aoXFixfD0tISs2bN0pjxOX/+PHr27KkKYgCgr6+P/v3749q1a8jLy8P777+PM2fOwNjYGCUlJSgpKUHnzp1hY2ODiIgIAGV/AevatSv09P77n2JfX18AUAsA3bp1g1KpxJUrV5CUlIS0tDT4+fmp9tvb26Np06ZwcXFBgwYNVNu7d++OlJQUJCcna1zj2rVrYW1tjbfeeusZu1V26523tzd69uyp2nbnzh0kJyfjxRdfVF1/SUkJfHx8YGZmhj///FPtHI+OKSkp0boGpVKJkpIStdvZqurChQuQyWRqtzLq6elhx44dmDx5ssb4HTt2IC0tTRWGtTV48GBERkbi3r17AIDdu3fD1dUVHTp0gImJCSIiIjB69GgoFAqUlJSgtLQU/fv3R0ZGBuLi4p54bnt7e2zZsgWdOnVCYmIi/vzzT2zduhUXL16EXC5/4rHJycnQ09NT+72uDmtrazRp0gT/+9//EBMTg/z8fK3+2QwcOFDt5z59+qC4uBiXLl2CgYEBNmzYgH79+iElJQV//fUXduzYgePHjwMA5HI5DAwMsHTpUvTu3RslJSWIj4/H5cuX4e7urnbe8v6Wf1VUnza/VzKZDF988QUSExPxySefYNCgQWphtDJJSUkYP348lEolZs+erfa/B0RU93FmjIjoMWZmZhg0aBDc3Nzw1ltvYcqUKfj5559Vs0tZWVlqoadcgwYNoFQqkZubC1NTUxgaGgIAevfurfqLNwBkZ2cDANLS0lQzEOXKf7ayslJts7S0VI1PS0tTG1fOyspK4y9xFhYWAIAHDx6gUaNGqu3x8fHYvHkzvv/+eyQlJVWxKxU7f/48jhw5gt9++03ttr3MzEwAwNy5czF37lyN48qfjSvXunXrZ6ojIiJCdQ5ra2u0adMGkydPVpu9qUxmZmaF/ats7MqVKxESElLt0NKvXz8sXLgQe/fuxdixY3Hw4EFMmDBBtV9PTw8GBgbYtWsXZs2apXbso88OVua3337D8uXLcf/+fVhZWaFly5YwMjJ66nGJiYmws7ODTCbT/qIeIZFIsGbNGsyePRsDBgzQ+tmnx2eSymcWy6/99OnTWLhwIe7cuQNTU1N4en
rCxMQEADQ+a/DgwYiOjoZEItF4/mvt2rVYu3btE2vR9veqZcuW8PDwwLVr19T+48STJCYmonv37ggPD8eaNWs0nk0
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Список для хранения коэффициентов силуэта\n",
|
|||
|
"silhouette_scores = []\n",
|
|||
|
"\n",
|
|||
|
"# Вычисление коэффициентов силуэта для каждого количества кластеров\n",
|
|||
|
"for i in clusters_range[1:]: \n",
|
|||
|
" kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
|
|||
|
" labels = kmeans.fit_predict(data_scaled)\n",
|
|||
|
" score = silhouette_score(data_scaled, labels)\n",
|
|||
|
" silhouette_scores.append(score)\n",
|
|||
|
"\n",
|
|||
|
"# Построение диаграммы значений силуэта\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.plot(clusters_range[1:], silhouette_scores, marker='o')\n",
|
|||
|
"plt.title('Коэффициенты силуэта для разных k')\n",
|
|||
|
"plt.xlabel('Количество кластеров')\n",
|
|||
|
"plt.ylabel('Коэффициент силуэта')\n",
|
|||
|
"plt.grid(True)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Средний коэффициент силуэта: 0.678\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1MAAAJzCAYAAADqY0keAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3xUVfrH8c+ZmkaTXqQrRXpHBQRsa13Q/SkqiGUBBde6llXXCpbFCiggqyisyloX27piY7EhKsgKgoAgvfckU8/vj0sGhiSQGZJMQr7v14sXzD137n1ymMB9cs55jrHWWkRERERERCQhrlQHICIiIiIiUh4pmRIREREREUmCkikREREREZEkKJkSERERERFJgpIpERERERGRJCiZEhERERERSYKSKRERERERkSQomRIREREREUmCkikREREREZEkKJkSkaQNHjyYFi1axP3q0qULQ4YMYe7cuakOT0SOci1atGDcuHH5ji9dupSePXvSp08fVq5cWej7x40bR4sWLWjbti179uwp8JxXXnmFFi1a0K9fv+IKW0SOIkqmROSItG7dmhkzZjBjxgxefvllHn74YbxeL1dddRW//PJLqsMTkQrml19+YejQoaSnpzN9+nQaN2582PeEw2E++eSTAtvef//9Yo5QRI4mSqZE5IhkZWXRoUMHOnToQOfOnTn11FMZN24cLpeLN998M9XhiUgFsnz5ci6//HIyMzOZPn06xx57bJHe16lTJz744IN8xzdu3Mi8efNo1apVcYcqIkcJJVMiUuzS09Px+/0YY2LHBg8ezODBg+POe+yxx2jRokVc0jV9+nT69+9Px44dueyyy1i6dCkA//jHP2jRogW//vpr3DX+9a9/0apVK9avXw/ArFmzuOSSS+jYsSNt2rThzDPP5B//+Efce26//fZ80xPzfq1ZsyZ2zsHTel599dV804ref/99zjrrLDp06MDAgQOZN29e3HsOF88333xDixYt+Oabb+Led3B/FaX/gsEgjzzyCH369KFVq1ZxX9ehEtuDrz169Gjatm3L7Nmzgf1ToQr6dWDcRen7TZs2cdttt9GzZ8/Y3/EPP/wAQL9+/Q779zJv3jwuu+wy2rdvT7du3bjtttvYtm1b7PpvvvkmLVq0YMGCBQwYMIB27dpx7rnn8u9//zsujt27d/PQQw9x6qmn0rZtW8455xxef/31uHMOjKdly5Z07dqV6667ju3btxfalwArVqxg1KhRdOvWja5duzJ8+HCWL19e6PmH6t8D/95WrlzJn/70J0466SQ6dOjA4MGD+e6772Lta9asib1v5syZcff49NNPY20Hev/99xk4cCAdO3bkpJNO4q9//Ss7d+7MF9uBCvos9uvXj9tvv73Q1wfLi/XAr+/777/noosuom3btpx00kk88MAD5ObmFnqNgy1fvpwhQ4ZQqVIlpk+fTr169Yr83rPOOos5c+bkm+r373//myZNmtCyZct875k1axYDBw6Mxfvggw+SnZ2d75yifP9/9dVXXHnllbRv356TTjqJv/3tb0Qikdh5X3zxBf/3f/9Hx44d6dq1K9dcc80hP1MiUnqUTInIEbHWEg6HCYfDhEIhNm/ezGOPPUYwGOSCCy4o9H2//fYbU6dOjTv2n//8hwceeICzzz6bCRMmEIlEGDFiBMFgkHPPPRe/38+//vWvuPe8/fbb9OzZk7p16/LZZ58xcuRITjjhBJ555hnGjRvHsccey/3338+CBQvi3lezZs3Y9MQZM2ZwzTXXHPLr3LlzJ08++WTcsR9//JFbbrmFDh068Oyzz1K3bl1GjBjBli1bABKKJ1EF9d9zzz3Hiy++yOWXX86LL77IjBkzGD9+fELX/fHHH3nllVd48skn6dixY1zbgf3117/+Na6tKF/r3r17GTRoEN988w1//vOfGT9+PH6/nyuvvJKVK1cyfvz4uJivueaa2P1q1arFt99+y9ChQ0lLS+
PJJ5/kL3/5C3PnzmXIkCH5HrqHDx9O//79GT9+PE2aNOGGG27g888/ByA3N5dLLrmEd955h6uvvppnnnmGzp07c+eddzJx4sS46/Tp04cZM2Ywbdo0br75Zr744gtGjx5daP9t3LiRiy66iJUrV3Lvvffyt7/9jS1btnD55ZezY8eOQ/b9gf178N/bsmXLGDhwIGvWrOGuu+5i7NixGGO4/PLL861PzMzMzDdl7f3338fliv8v/5lnnuGmm26iQ4cOPP3004wcOZIPP/yQwYMHJ5TEFIf169dz1VVXUa1aNcaPH8+f/vQn/vWvf3HrrbcW6f0rVqzg8ssvJysri+nTp1O7du2E7n/GGWcQiUQK7Lezzz473/nvvPMOI0eOpGnTpkyYMIFRo0Yxc+ZMrr32Wqy1QGLf/7fccgudO3dm4sSJnHPOOUyZMoXXXnsNgNWrV3PttdfSpk0bnn32WUaPHs2vv/7KsGHDiEajCX2dIlL8PKkOQETKt2+//ZYTTjgh3/GbbrqJZs2aFfq+MWPGcNxxx/HTTz/Fjm3bto1LLrmEm266CXBGWvJ+qt+qVStOO+00Zs6cyfXXX48xhg0bNvD111/zt7/9DXAeOAcMGMCdd94Zu2bHjh3p3r0733zzDe3bt48d9/l8dOjQIfZ6xYoVh/w6n376aerVqxc3KrFhwwbOOOMMHnzwQVwuFzVq1OCcc85h/vz5nHrqqQnFk6iC+u/HH3+kZcuWXHnllbFjeSM6RZU3Mti/f/98bQf2VyAQiGsrytf61ltvsXbtWt56663YtKlOnTrx+9//nm+//ZY//OEPcTE3bNgw7p6PPfYYTZo0YdKkSbjdbgDat2/P2WefzRtvvMGll14aO3fw4MGMHDkSgF69ejFgwAAmTJhAnz59ePPNN1m6dCmvvvpqLGHs1asX4XCYZ555hosvvpiqVasCcMwxx8Ri6Nq1K19++WVcnx9s6tSpBINBXnjhBWrWrAlAy5YtGTRoEAsWLKBPnz6FvvfAr/Xgv7fx48fj8/l46aWXyMrKAuCUU07hnHPO4dFHH40bVevduzf//e9/CQaD+Hw+AoEAH3/8MV27do2NJO7cuZNnn32W//u//4tLjI8//nguvfTSfP1Z0p577jmqVavGhAkTYn+3LpeLu+66iyVLluQbHTvQypUrGTJkCFu2bCEUCiWVYNSoUYOuXbvywQcfcN555wGwdu1aFixYwKOPPsqzzz4bO9day9ixY+nVqxdjx46NHW/cuDFDhw7l888/55RTTkno+/8Pf/hD7PPas2dPZs2axWeffcbFF1/Mjz/+SG5uLsOHD48liXXq1OHjjz8mOzs79nkQkdRQMiUiR+SEE07gvvvuA5yHjF27djF79myeeOIJsrOzufHGG/O9Z/bs2Xz55Zc899xzDBkyJHb84osvBiAajZKdnc1//vMf0tLSqF+/PgAXXngh7777LvPmzaNr1668/fbbZGZmctpppwFw9dVXA84IyK+//spvv/3GwoULAScxS9bSpUtjoxN5MQKcfvrpnH766Vhryc7O5oMPPsDlctGkSZMSjaew/mvbti2TJ0/mww8/pEePHmRmZhb5wdJayw8//MD777+fb8SrKIrytX733Xc0aNAgbv1Jeno6H3744WGvn5OTw4IFC7jqqqtio6EAxx57LM2aNeOLL76Ie/gfMGBA7M/GGE477TTGjRtHbm4uc+fOpX79+vlG3s477zxef/31uKQn717RaJSff/6Z7777jhNPPLHQOL/77js6dOgQS6TAefD99NNPD/s1HsrcuXPp27dv3IOzx+OJjeLu3bs3drxHjx7Mnj2bb775hl69ejF79myysrLo0qVLLJmaP38+wWCQc845J+4+Xbp0oX79+sydO/eIk6m8vnO5XPlGxfJEo1HC4TDz5s3j5JNPjiVS4CSF4PTpoZKpd999lzZt2vDEE09w5ZVX8uc//5mpU6fG3TMSicRGjMD5TBx4L3Cm+j
344IPs2bOHrKws3nvvPU444QQaNWoUd96KFSvYsGEDw4cPj30OwUm2s7Ky+OKLLzjllFMS+v4/+LNYp06d2JTB9u3
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x700 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Добавляем индекс строки как дополнительный признак\n",
|
|||
|
"data_scaled_with_index = np.hstack((data_scaled, np.arange(data_scaled.shape[0]).reshape(-1, 1)))\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Применение K-Means\n",
|
|||
|
"# ========================\n",
|
|||
|
"kmeans = KMeans(n_clusters=3, random_state=42) \n",
|
|||
|
"df_clusters = kmeans.fit_predict(data_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Оценка качества кластеризации\n",
|
|||
|
"# ========================\n",
|
|||
|
"silhouette_avg = silhouette_score(data_scaled, df_clusters)\n",
|
|||
|
"print(f'Средний коэффициент силуэта: {silhouette_avg:.3f}')\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Визуализация кластеров\n",
|
|||
|
"# ========================\n",
|
|||
|
"pca = PCA(n_components=2)\n",
|
|||
|
"df_pca = pca.fit_transform(data_scaled_with_index)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 7))\n",
|
|||
|
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_clusters, palette='viridis', alpha=0.7)\n",
|
|||
|
"plt.title('Визуализация кластеров с помощью K-Means')\n",
|
|||
|
"plt.xlabel('Первая компонента PCA')\n",
|
|||
|
"plt.ylabel('Вторая компонента PCA')\n",
|
|||
|
"plt.legend(title='Кластер', loc='upper right')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Средний коэффициент силуэта, равный 0.678, указывает на хорошую кластеризацию. \n",
|
|||
|
"\n",
|
|||
|
"Средний коэффициент силуэта (silhouette score) указывает на качество кластеризации, измеряя, насколько хорошо точки внутри одного кластера близки друг к другу по сравнению с точками из других кластеров. Значения коэффициента силуэта находятся в диапазоне от -1 до 1:\n",
|
|||
|
"\n",
|
|||
|
"1: Указывает на идеально плотные и четко разделенные кластеры. \n",
|
|||
|
"0: Указывает на перекрытие кластеров или слабую структуру кластеризации. \n",
|
|||
|
"Отрицательные значения: Указывают, что точки в кластере расположены ближе к другому кластеру, чем к своему."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Средний коэффициент силуэта (агломеративная кластеризация): 0.724\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1MAAAJzCAYAAADqY0keAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3iTVf8G8Du7TdK9d0sLpYWW1UJBNjheURT0fRWZKgIKKCgvyk9REEFEQGUvlSWCCiIovig4EET2phRoaUv33mn27w9sbGgLTWhJS+/PdXFBn/MkzzenIcmdc57zCIxGoxFERERERERkEaGtCyAiIiIiImqOGKaIiIiIiIiswDBFRERERERkBYYpIiIiIiIiKzBMERERERERWYFhioiIiIiIyAoMU0RERERERFZgmCIiIiIiIrICwxQREREREZEVxLYugKgxjBw5EkePHjXb5uDggMjISEyaNAldu3a1UWVERNRcvf/++ygvL8drr72GhIQEvPzyyzh8+DBEIpGtSyMiG2GYontWZGQk3nnnHQCAXq9HYWEhvvzySzz//PPYsWMHWrdubeMKiYioORkzZgxGjBiBuLg4SCQSzJw5k0GKqIUTGI1Go62LIGpoI0eOBABs2rTJbHtFRQW6d++OZ555Bq+//rotSiMiomZMq9UiNTUVbm5ucHZ2tnU5RGRjPGeKWhR7e3vIZDIIBALTtpEjR5rCV5VFixYhPDwcO3bsMG3bvHkzBgwYgE6dOmHEiBG4fPkyAOCLL75AeHg4rl27ZnYf3333HSIiIpCZmQkA2LdvH5555hl06tQJ7du3x0MPPYQvvvjC7DZvvPEGwsPDa/2TlpZm2qd///5mt9u6dSvCw8OxdOlS07Y9e/bg4YcfRseOHTF06FAcP37c7Da3q+fIkSMIDw/HkSNHzG53c3/Vp/80Gg0++OAD9OnTBxEREWaPq3of3+zm+547dy6ioqJw4MABAMDSpUvr7K/qdden73NycvD666+je/fupt/xqVOnAAD9+/e/7e/l+PHjGDFiBDp06ICuXbvi9ddfR0FBgen+d+zYgfDwcJw5cwZDhgxBdHQ0Hn30Ufzvf/8zq6O0tBTvv/8+Bg4ciKioKDzyyCP45ptvzPapXk/btm0RGxuLyZMno7CwsM6+BICkpCTTNNfY2FiMHz8eiYmJde5/q/6t/ntLTk7Gyy+/jPvuuw8dO3bEyJEjceLECVN7Wlqa6Xa7du0yO8avv/5qaqtuz549GDp0KDp16oT77rsPb7/9NoqLi2vUVl1tz8X+/fvjjTfeqPPnm1XVWv3xnTx5Ek899RSioqJw3333Yc6cOaisrKzzPqruZ/r06ejZsyfatWuH7t27Y/r06Wa/o9qeV2lpafV+Xufk5GDGjBno06cPoqOj8eSTT2L//v1mdVTdbuXKlWbbL1++XOM5DDTc8/hWj7/68+HmP1WvbfV5XamqpepP+/bt8eCDD5o9x2p7nlT1S/XXy/r25dKlSyGRSBAaGgonJyc8/fTTNfrwVscqLy/HyJEjERkZCbVabXqsdfVHFb1ejzVr1uCRRx5BdHQ0OnbsiKeffhp//fWX2bFOnz6N5557Dp07d0ZcXBxeffVVZGdn16vPAeDrr7/GoEGD0L59e/Tt2xdLly6FXq83tb/xxhsYOXIkvvnmG/Tr1w+dOnXC6NGjcenSJdM+Vb+X6n1y5coVtGvXzux3Gh8fj+HDh6NTp04YOHAgtm7davZYLl26hEmTJiEuLg7t2rVDr1698N5775n937v59wjU/J3X9hz4448/EB4ebnotqO3/vVqtxoABA2p9/hBVxzBF9yyj0QidTgedTgetVovc3FwsWrQIGo0GTzzxRJ23S01Nxfr16822/fTTT5gzZw4GDRqE5cuXQ6/XY8KECdBoNHj00Uchk8nw3Xffmd1m586d6N69O3x8fPDbb79h4sSJaNeuHVasWIGlS5ciICAA7777Ls
6cOWN2Ow8PD2zbts3058UXX7zl4ywuLsbHH39stu3s2bOYNm0aOnbsiJUrV8LHxwcTJkxAXl4eAFhUj6Vq67+1a9diw4YNGD16NDZs2IBt27Zh2bJlFt3v2bNn8eWXX+Ljjz9Gp06dzNqq99fbb79t1lafx1peXo5hw4bhyJEj+O9//4tly5ZBJpPhueeeQ3JyMpYtW2ZW84svvmg6nqenJ44dO4YxY8bAzs4OH3/8Mf7v//4PR48exahRo2p86B4/fjwGDBiAZcuWISQkBFOmTMHvv/8OAKisrMQzzzyD3bt3Y+zYsVixYgW6dOmCN998E6tWrTK7nz59+mDbtm3YtGkTXnvtNRw6dAhz586ts/+ys7Px1FNPITk5GbNmzcKHH36IvLw8jB49GkVFRbfs++r9e/Pv7erVqxg6dCjS0tLw1ltvYeHChRAIBBg9enSN8xYVCgV++eUXs2179uyBUGj+VrRixQq8+uqr6NixI5YsWYKJEydi7969GDly5G1DTEPLzMzE888/DxcXFyxbtgwvv/wyvvvuO0yfPr3O26hUKowaNQqJiYl455138Omnn2LUqFH44Ycf8NFHH5ntW/V7rP58qnKr53VeXh6efPJJHD9+HFOnTsXSpUvh5+eHiRMn1gis9e33hnoe3+7xe3p61nh9u/n/2M1qe12pUnXb5cuXo1WrVnj99ddrfLl1K5b0ZXXfffed6QuX+tqyZQvy8vKwYcMGSKVS0/bIyEiz3/eTTz5pdruFCxdixYoVeOqpp7Bu3TrMmTMHRUVFeOWVV6BSqQAAFy9exIgRI6BWq7FgwQLMnj0b58+fx/PPP1+vPl+9ejVmzpyJ7t27Y9WqVRg+fDjWrl2LmTNnmtUSHx+Pjz76CJMmTcKHH36IwsJCjBgxAjk5OXU+7rlz50Kn05l+VqlUeOGFF6DT6bB06VIMHjwY77zzjumLspycHAwfPhwqlQrz58/H2rVrMWjQIGzatAkbN260qM9vptVqMW/evNvut27duluGZKIqPGeK7lnHjh1Du3btamx/9dVXERoaWuft5s2bh9atW+PChQumbQUFBXjmmWfw6quvArgx0lL1rX5ERATuv/9+7Nq1C6+88goEAgGysrLw119/4cMPPwRw4wPnkCFD8Oabb5rus1OnTujWrRuOHDmCDh06mLZLpVJ07NjR9HNSUtItH+eSJUvg6+tr9o13VlYWHnzwQbz33nsQCoVwd3fHI488gtOnT2PgwIEW1WOp2vrv7NmzaNu2LZ577jnTNkvfpKpGBgcMGFCjrXp/VX3bW6U+j/Xbb79Feno6vv32W0RERAAAOnfujMcffxzHjh3Dv//9b7OaAwMDzY65aNEihISEYPXq1abzJzp06IBBgwZh+/btGD58uGnfkSNHYuLEiQCAXr16YciQIVi+fDn69OmDHTt24PLly9i6daspMPbq1Qs6nQ4rVqzA008/bZpW5OrqaqohNjYWf/75p1mf32z9+vXQaDT4/PPP4eHhAQBo27Ythg0bhjNnzqBPnz513rb6Y73597Zs2TJIpVJs3LgRSqUSANC3b1888sgjWLBggdmoWu/evfHHH39Ao9FAKpVCrVZj//79iI2NNY24FBcXY+XKlfjPf/5jFiDatGmD4cOH1+jPxrZ27Vq4uLhg+fLlpt+tUCjEW2+9hYSEhFq/tU5OToa3tzc++OADBAQEAADi4uJw5syZGgGz+u/xZrd6Xn/++ecoKCjA3r174efnB+BGMBszZgwWLFiARx55xBSWevfujf/973/IyckxhbUff/zRrN+Bhnse3+7xV3+Nq3p9i4iIgL+/f639ANT+ulKl+m19fHzwyy+/ID4+HiEhIXXen7V9WaW8vBwLFy5Eu3btbvn/rjq9Xm86bzc2NtasTalUmv2+//jjD7P2nJwcTJ061WxkRyaTYfLkyUhISEDHjh2xatUqODs747PPPoNMJgMAeHp64rXXXkNiYuIt+7y0tNQU1t
566y0AQM+ePeHs7Iy33noLzz77rOk849LSUqxatQoxMTEAgOjoaAwcOBAbN27EtGnTajzuvXv34syZM2a/j/T0dER
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x700 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Добавляем индекс строки как дополнительный признак\n",
|
|||
|
"data_scaled_with_index = np.hstack((data_scaled, np.arange(data_scaled.shape[0]).reshape(-1, 1)))\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Агломеративная кластеризация\n",
|
|||
|
"# ========================\n",
|
|||
|
"agg_cluster = AgglomerativeClustering(n_clusters=3) \n",
|
|||
|
"labels_agg = agg_cluster.fit_predict(data_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Оценка качества кластеризации\n",
|
|||
|
"# ========================\n",
|
|||
|
"silhouette_avg_agg = silhouette_score(data_scaled, labels_agg)\n",
|
|||
|
"print(f'Средний коэффициент силуэта (агломеративная кластеризация): {silhouette_avg_agg:.3f}')\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Визуализация кластеров\n",
|
|||
|
"# ========================\n",
|
|||
|
"pca = PCA(n_components=2)\n",
|
|||
|
"df_pca = pca.fit_transform(data_scaled_with_index)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 7))\n",
|
|||
|
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=labels_agg, palette='viridis', alpha=0.7)\n",
|
|||
|
"plt.title('Визуализация кластеров с помощью агломеративной кластеризации')\n",
|
|||
|
"plt.xlabel('Первая компонента PCA')\n",
|
|||
|
"plt.ylabel('Вторая компонента PCA')\n",
|
|||
|
"plt.legend(title='Кластер', loc='upper right')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Значение коэффициента силуэта лежит в диапазоне от -1 до 1. Ближе к 1: Хорошо сформированные, плотные кластеры, четко отделенные друг от друга. \n",
|
|||
|
"\n",
|
|||
|
"Ближе к 0: Кластеры пересекаются или слабо разделены, не имеют четких границ. Точки расположены одинаково близко как к своему кластеру, так и к соседним. \n",
|
|||
|
"Ближе к -1 (Отрицательные значения): Некоторые точки скорее относятся к другим кластерам, чем к текущему (ближе к центрам других кластеров). Очень плохая кластеризация. \n",
|
|||
|
"Ближе к 1: Все точки внутри каждого кластера плотно сгруппированы и значительно удалены от точек других кластеров. Свидетельствует о четкой и хорошо разделенной структуре данных. Единица говорит об идеальной кластеризации.\n",
|
|||
|
"\n",
|
|||
|
"Средний коэффициент силуэта, равный 0.724, указывает на то, что кластеры имеют хорошее разделение и четкие границы. Точки внутри каждого кластера достаточно плотно сгруппированы и значительно удалены от точек других кластеров, что свидетельствует о четкой и хорошо разделенной структуре данных."
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.9.7"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|