540 lines
802 KiB
Plaintext
540 lines
802 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Цель работы — разделить данные на кластеры\n",
|
|||
|
"## Кластеры будут содержать миллиардеров с похожими характеристиками. Возможно, это поможет тем, кто анализирует данные об этих людях"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Загрузка датасета "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Rank</th>\n",
|
|||
|
" <th>Name</th>\n",
|
|||
|
" <th>Networth</th>\n",
|
|||
|
" <th>Age</th>\n",
|
|||
|
" <th>Country</th>\n",
|
|||
|
" <th>Source</th>\n",
|
|||
|
" <th>Industry</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>Elon Musk</td>\n",
|
|||
|
" <td>219.0</td>\n",
|
|||
|
" <td>50</td>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>Tesla, SpaceX</td>\n",
|
|||
|
" <td>Automotive</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>Jeff Bezos</td>\n",
|
|||
|
" <td>171.0</td>\n",
|
|||
|
" <td>58</td>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>Amazon</td>\n",
|
|||
|
" <td>Technology</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Bernard Arnault & family</td>\n",
|
|||
|
" <td>158.0</td>\n",
|
|||
|
" <td>73</td>\n",
|
|||
|
" <td>France</td>\n",
|
|||
|
" <td>LVMH</td>\n",
|
|||
|
" <td>Fashion & Retail</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>Bill Gates</td>\n",
|
|||
|
" <td>129.0</td>\n",
|
|||
|
" <td>66</td>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>Microsoft</td>\n",
|
|||
|
" <td>Technology</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>Warren Buffett</td>\n",
|
|||
|
" <td>118.0</td>\n",
|
|||
|
" <td>91</td>\n",
|
|||
|
" <td>United States</td>\n",
|
|||
|
" <td>Berkshire Hathaway</td>\n",
|
|||
|
" <td>Finance & Investments</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2595</th>\n",
|
|||
|
" <td>2578</td>\n",
|
|||
|
" <td>Jorge Gallardo Ballart</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>80</td>\n",
|
|||
|
" <td>Spain</td>\n",
|
|||
|
" <td>pharmaceuticals</td>\n",
|
|||
|
" <td>Healthcare</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2596</th>\n",
|
|||
|
" <td>2578</td>\n",
|
|||
|
" <td>Nari Genomal</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>82</td>\n",
|
|||
|
" <td>Philippines</td>\n",
|
|||
|
" <td>apparel</td>\n",
|
|||
|
" <td>Fashion & Retail</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2597</th>\n",
|
|||
|
" <td>2578</td>\n",
|
|||
|
" <td>Ramesh Genomal</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>71</td>\n",
|
|||
|
" <td>Philippines</td>\n",
|
|||
|
" <td>apparel</td>\n",
|
|||
|
" <td>Fashion & Retail</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2598</th>\n",
|
|||
|
" <td>2578</td>\n",
|
|||
|
" <td>Sunder Genomal</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>68</td>\n",
|
|||
|
" <td>Philippines</td>\n",
|
|||
|
" <td>garments</td>\n",
|
|||
|
" <td>Fashion & Retail</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2599</th>\n",
|
|||
|
" <td>2578</td>\n",
|
|||
|
" <td>Horst-Otto Gerberding</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>69</td>\n",
|
|||
|
" <td>Germany</td>\n",
|
|||
|
" <td>flavors and fragrances</td>\n",
|
|||
|
" <td>Food & Beverage</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>2600 rows × 7 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Rank Name Networth Age Country \\\n",
|
|||
|
"0 1 Elon Musk 219.0 50 United States \n",
|
|||
|
"1 2 Jeff Bezos 171.0 58 United States \n",
|
|||
|
"2 3 Bernard Arnault & family 158.0 73 France \n",
|
|||
|
"3 4 Bill Gates 129.0 66 United States \n",
|
|||
|
"4 5 Warren Buffett 118.0 91 United States \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"2595 2578 Jorge Gallardo Ballart 1.0 80 Spain \n",
|
|||
|
"2596 2578 Nari Genomal 1.0 82 Philippines \n",
|
|||
|
"2597 2578 Ramesh Genomal 1.0 71 Philippines \n",
|
|||
|
"2598 2578 Sunder Genomal 1.0 68 Philippines \n",
|
|||
|
"2599 2578 Horst-Otto Gerberding 1.0 69 Germany \n",
|
|||
|
"\n",
|
|||
|
" Source Industry \n",
|
|||
|
"0 Tesla, SpaceX Automotive \n",
|
|||
|
"1 Amazon Technology \n",
|
|||
|
"2 LVMH Fashion & Retail \n",
|
|||
|
"3 Microsoft Technology \n",
|
|||
|
"4 Berkshire Hathaway Finance & Investments \n",
|
|||
|
"... ... ... \n",
|
|||
|
"2595 pharmaceuticals Healthcare \n",
|
|||
|
"2596 apparel Fashion & Retail \n",
|
|||
|
"2597 apparel Fashion & Retail \n",
|
|||
|
"2598 garments Fashion & Retail \n",
|
|||
|
"2599 flavors and fragrances Food & Beverage \n",
|
|||
|
"\n",
|
|||
|
"[2600 rows x 7 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 1,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"df = pd.read_csv(\"C://Users//annal//aim//static//csv//Forbes_Billionaires.csv\")\n",
|
|||
|
"df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Подготовим данные. Удалим неинформативные столбцы, преобразуем категориальные столбцы в числовые и нормализуем"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Удаляем неинформативные столбцы\n",
|
|||
|
"df = df.drop(columns=[\"Rank \", \"Name\"])\n",
|
|||
|
"\n",
|
|||
|
"# Преобразование категориальных данных в числовые с помощью one-hot encoding\n",
|
|||
|
"df = pd.get_dummies(df, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# Нормализация числовых данных\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"df_scaled = scaler.fit_transform(df)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Визуализируем данные при помощи PCA\n",
|
|||
|
"## Метод PCA уменьшает количество измерений (здесь — до двух), сохраняя максимально возможное количество информации. Он находит в данных новые оси (главные компоненты), вдоль которых разброс данных максимален, и проецирует данные на эти оси"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAroAAAIjCAYAAADslLiSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd3hUVfrHP/dOT69AIKGGEJAWQ1ERgVVUFF0rrquCHfuydl0Vy7qI7lrWtbDuSrNSLFhAwRVsSA1ID6GmkDKTZJJMv/ee3x83yS+B0BQE4vk8zzyQO2fOnHvnlu95z1sUIYRAIpFIJBKJRCJpZajHegASiUQikUgkEsnRQApdiUQikUgkEkmrRApdiUQikUgkEkmrRApdiUQikUgkEkmrRApdiUQikUgkEkmrRApdiUQikUgkEkmrRApdiUQikUgkEkmrRApdiUQikUgkEkmrRApdiUQikUgkEkmrRApdiUQikUgkEkmrRArd3yjTpk1DUZTGl9PpJCsrizvuuIOysrJ92peVlXHvvfeSnZ1NVFQU0dHR5Obm8te//pXq6uoWv2PQoEEoisJrr732i8d77bXXNhuv1WolIyODP/zhD2zcuPEX9y+RSCQSiaT1YT3WA5AcW5588km6dOlCMBjku+++47XXXuPzzz9n/fr1REVFAbBixQrOO+886urquPrqq8nNzQVg5cqVPPPMM3zzzTd8+eWXzfrdunUrK1asoHPnzrz99tvceuutv3isDoeD//znPwBomsa2bdt4/fXXWbBgARs3bqR9+/a/+DskEolEIpG0HqTQ/Y0zatQoBgwYAMCNN95IcnIyzz//PB9//DFXXnkl1dXVXHzxxVgsFvLy8sjOzm72+aeffpo33nhjn37feust2rRpwz/+8Q8uu+wydu7cSefOnX/RWK1WK1dffXWzbaeccgqjR4/ms88+46abbvpF/UskEolEImldSNcFSTN+97vfAbBjxw4ApkyZQnFxMc8///w+Ihegbdu2PPLII/tsf+edd7jssssYPXo08fHxvPPOO/u08fv9bN68Gbfb/bPH265dO8AUwQ00uGXs3LmzcZthGPTt2xdFUZg2bVrj9scff5xevXoRExNDXFwcp5xyCh999FHj+xMnTsRms1FRUbHPd998880kJCQQDAYB+Pjjjzn//PNp3749DoeDbt268dRTT6Hr+j6f3blzZzNXjKavvds0HS/A7bffjqIoXHvttY3bPvjgAwYNGkRSUhIul4vs7GwmT56MEKKxza5du7jtttvo0aMHLpeL5ORkLr/88mbHqenxW7lyZbPtbrcbRVF4/PHHm21vadtzzz2HoigMHz682fbt27dz+eWX0759e1RVbdzn3r1773OMWmLx4sUHPW5Ha18ff/xxFEXZ53xduXLlPr/TtddeS0xMzAH3pWn/gUCA7OxssrOzCQQCjW0qKytJS0vjtNNOa/E82ns/9vfa+/fJy8tj1KhRxMXFERMTw5lnnsmPP/64T7/V1dX8+c9/pnPnzjgcDtLT0xk7duw+x2Bv16L9fe/mzZu57LLLSEpKwul0MmDAAObNm3fA49SAYRi89NJL9OnTB6fTSWpqKueee26z305RFO644459Pjt69OhmE+3DubamTJmCqqrMnj37gJ/Pz88nKSmJP/7xj836rK6uZsKECWRkZOBwOMjMzGTy5MkYhrFPf3//+9/3GXvv3r2bXUcN18DixYubtTv//PNbPOZff/01Q4cOJTExsdlv09JxakrD97z//vs8/PDDtGvXjujoaC688EIKCwubtf3222+5/PLL6dixIw6Hg4yMDP785z83O5fh4PdbgOHDh6MoChdddNE+Yxo/fnyL9wvDMHjxxRc56aSTcDqdtG3blvHjx1NVVdWsXefOnRk9ejRffvkl/fv3x+l00qtXLz744INm7Q73HngkrqcD3dv2vp4a7kVNqauro127di2eG791pEVX0oxt27
YBkJycDMC8efNwuVxcdtllh9zHsmXLKCgoYOrUqdjtdi655BLefvttHn744Wbtli9fzogRI5g4ceI+N4790fCA1XWd7du388ADD5CcnMzo0aMP+LmZM2eybt26fbb7fD4uvvhiOnfuTCAQYNq0aVx66aUsXbqUQYMGcc011/Dkk0/y/vvvN3swhMNh5syZw6WXXorT6QTMm2NMTAx33303MTEx/O9//+Oxxx6jpqaG5557rsVx3XzzzQwdOhQwxeqHH354wP0oKCho0YJeU1PD4MGDGTduHDabjQULFvDggw9itVq55557ANMF5YcffuAPf/gD6enp7Ny5k9dee43hw4ezcePGRleVX0p1dTWTJk3aZ7uu61x44YXs2rWLCRMmkJWVhaIoPP3004f9HXfddRcDBw4EYMaMGSxcuLDZ+7/Wvh4pXC4X06dPZ8iQIfzlL3/h+eefB0zh5fV6mTZtGhaL5aD9NLgiNVBXV7eP29CGDRsYOnQocXFx3H///dhsNqZMmcLw4cNZsmQJgwcPbvzs0KFD2bRpE9dffz0nn3wybrebefPmUVRUREpKSrN+U1JSeOGFFxr/vuaaa/b53iFDhtChQwcefPBBoqOjmTVrFhdddBFz587l4osvPuC+3XDDDUybNo1Ro0Zx4403omka3377LT/++GPjqtQvYX/X1vjx49myZQvjxo2jc+fOjeddUyorKxk9ejQ9e/Zk6tSpjdv9fj/Dhg2juLiY8ePH07FjR3744Qceeugh9uzZw4svvviLxw3wzTff8Pnnn++zfceOHZx//vmkpaXx2GOPkZqaCuz72xyIp59+GkVReOCBBygvL+fFF1/krLPOYs2aNbhcLgBmz56N3+/n1ltvJTk5meXLl/Pyyy9TVFTUbIJwsPttA06nk88++4zy8nLatGkDmJPB999/v/F+25Tx48czbdo0rrvuOu666y527NjBv/71L/Ly8vj++++x2WyNbbdu3coVV1zBLbfcwrhx45g6dSqXX345CxYsYOTIkYd8XBo4UtdTz549mTlzZmO///73v9m0aVOza6pv3777Hcc//vGPFuNrJICQ/CaZOnWqAMSiRYtERUWFKCwsFO+9955ITk4WLpdLFBUVCSGESExMFP369Tusvu+44w6RkZEhDMMQQgjx5ZdfCkDk5eU1a/f1118LQEycOPGgfY4bN04A+7w6dOggVq1a1eK+7dixQwghRDAYFB07dhSjRo0SgJg6dep+v6e8vFwA4u9//3vjtlNPPVUMHjy4WbsPPvhAAOLrr79u3Ob3+/fpb/z48SIqKkoEg8Fm27du3SoAMX369MZtEydOFE0vyR07duwz3jFjxojevXuLjIwMMW7cuP3uhxBC9OrVS4wePfqA41u6dKkAxIwZMxq3NRy/FStWNGtbUVHR4u+197b7779ftGnTRuTm5ophw4Y1bt+yZYsAxKRJk5p9ftiwYeKkk0464L400HAuzZkzp3Hb7bffLva+lR2NfW34fSoqKpq1XbFixT6/07hx40R0dPQB96WlY/nQQw8JVVXFN998I2bPni0A8eKLLx6wn8Pdj4suukjY7Xaxbdu2xm0lJSUiNjZWnHHGGY3bHnvsMQGIDz74YJ/va7i2G7jqqqtEly5dDrh/Z555pujTp0+za8EwDHHaaaeJ7t27H3D//ve//wlA3HXXXQccCyBuv/32fdqcf/75olOnTo1/H+61peu6uOCCC0RaWpooLCxs9vlwOCyGDx8uunTpIsrLy5t97qmnnhLR0dEiPz+/2fYHH3xQWCwWsXv37mbjee655/YZ+0knndTsOmq4bza99wwePLjx/tb0mE+ZMkUAYunSpc363N9xakrD93To0EHU1NQ0bp81a5YAxEsvvdS4raXrbdKkSUJRFLFr1679fkdL99uG+0Hfvn2bbZ85c6ZIT08XQ4cObXa/+PbbbwUg3n777WZ9L1iwYJ/tnTp1EoCYO3du4zav1yvS0tJETk
5O47ZjfT0JYd5Dmp6zTdn7WVFeXi5iY2Mbz4Gm54ZECOm68BvnrLPOIjU1tTGDQUxMDB9++CEdOnQATEthbGzsIfe
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Импортируем PCA и визуализируем данные\n",
|
|||
|
"from sklearn.decomposition import PCA\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Применяем PCA для снижения размерности до 2\n",
|
|||
|
"pca = PCA(n_components=2)\n",
|
|||
|
"df_pca = pca.fit_transform(df_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация\n",
|
|||
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
"plt.scatter(df_pca[:, 0], df_pca[:, 1], c='red', edgecolor='k', alpha=0.6)\n",
|
|||
|
"plt.title(\"PCA: Визуализация данных после снижения размерности\")\n",
|
|||
|
"plt.xlabel(\"Главная компонента 1\")\n",
|
|||
|
"plt.ylabel(\"Главная компонента 2\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Определим количество кластеров\n",
|
|||
|
"## Это важно, потому что оптимальное количество кластеров позволит разделить данные так, чтобы не были упущены важные моменты. Кроме того, это сделает результаты понятными и полезными и повлияет на обобщающую способность модели. Если кластеров слишком много, модель будет обращать излишнее внимание на шум; если кластеров слишком мало, модель будет игнорировать важные моменты"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Для выбора количества кластеров воспользуемся методом локтя и коэффициентом силуэта\n",
|
|||
|
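"## Метод локтя опирается на инерцию, то есть величину ошибки внутри кластеров. Инерция — это сумма квадратов расстояний от точек до центроидов их кластеров. Чем меньше инерция, тем компактнее кластеры, но с ростом числа кластеров она всегда уменьшается, поэтому выбирают точку «локтя», после которой уменьшение заметно замедляется\n",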
|
|||
|
"## Коэффициент силуэта покажет, насколько хорошо распределены объекты по кластерам. Он оценивает плотность и разделённость кластеров и принимает значения от -1 до 1. Чем ближе результат к 1, тем лучше."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAsQAAAIjCAYAAAAEFA25AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACHq0lEQVR4nOzdd1gU1/4G8Hd26V0EBAURRbFgxYZgiw0raGI3auwGVExioslN1JgEk3gTY9dYsMTeS8QugmLvXSwgCiogve/O74/82OsGUEBgFvb9PM8+9zJ7ZuY7h93J63DmjCCKoggiIiIiIi0lk7oAIiIiIiIpMRATERERkVZjICYiIiIircZATERERERajYGYiIiIiLQaAzERERERaTUGYiIiIiLSagzERERERKTVdKQugIiI3i4rKwvx8fFQKpWoWrWq1OUQEVU4vEJMRKSBLl68iCFDhsDKygr6+vqws7PDhx9+KHVZREQVEgMxVSiBgYEQBAGCICA0NDTP+6IowsHBAYIgoFevXhJUSPRue/bsgaenJ27fvo0ff/wRR44cwZEjR7B8+XKpSyMiqpA4ZIIqJAMDA2zcuBGenp5qy4ODgxEVFQV9fX2JKiN6u/j4eIwZMwbdunXDtm3boKenJ3VJREQVHq8QU4XUo0cPbNu2DTk5OWrLN27cCDc3N9ja2kpUGdHbrVmzBhkZGQgMDGQYJiIqIwzEVCENHjwYcXFxOHLkiGpZVlYWtm/fjiFDhuS7jlKpxPz589GgQQMYGBigSpUqGD9+PF6/fq1qU6NGDdWQjPxeNWrUULVNTU3F559/DgcHB+jr68PFxQXz5s2DKIp59n3y5MkCt1lYI0eOzHf9WbNmqbU7fvw42rZtC2NjY1hYWMDb2xt37txRazNr1qw8+z5x4gT09fUxYcIEtTZve508eVK1/tKlS+Hq6gojIyO1Ntu3by/U8XXo0KFQxweoD51589WhQwe1dleuXIGXlxesra3V2r1rOM2TJ0/U2uvq6qJGjRqYNm0asrKy8tRx8eLFtx7Xm3WdPXsWTZo0wU8//aT67NSuXRtz586FUqlUWzcnJwdz5sxBrVq1oK+vjxo1auDrr79GZmamWrsaNWqgV69eOHz4MJo0aQIDAwPUr18fO3fuVGsXHx+PL774Ag0bNoSJiQnMzMzQvXt3XLt27a39kevfv4+cnBz06NEDlpaWuH37dpHrBgr+brz5Xctt8+bnDQB69uyZp6Z/9zfwv99nYGCg2vK7d+/io48+gqWlJQwMDNC8eXPs3bs3T40JCQmYOnUqatSoAX19fdjb22P48OGIjY1963f735/hf3+nTE1N0bJlS+zevVttfyEhIejfvz+qV68OfX19ODg4YOrUqUhPT89T27/lfiafPHmiWnbr1i1UqlQJvXr1ynMRoaDv3Zt9VZR67t69iwEDBsDa2hqGhoZwcXHBN998k+/xv+uccu7cOXh5ecHc3BxGRkZo3749Tp8+rba/3G3m7tfMzAyVK1fGlClTkJGRoda2KN+n3HpkMhlsbW0xcOBAREZGvrP/SXNxyARVSDVq1IC7uzs2bdqE7t27AwAOHjyIxMREDBo0CAsWLMizzvjx4xEYGIhPPvkEkydPxuPHj7Fo0SJcuXIFp0+fhq6uLubPn4+UlBQAwJ07d/DTTz/h66+/Rr169QAAJiYmAP4Zq9ynTx+cOHECo0ePRpMmTXDo0CFMmzYNz549w++//55v3ZMnT0aLFi0AAOvWrVML9IVhZWWltu2PP/5Y7f2jR4+ie/fuqFmzJmbNmoX09HQsXLgQHh4euHz5slrIeNO1a9fg4+ODHj16YPHixQCAfv36wdnZWdVm6tSpqFevHsaNG6daltsvW7ZswaeffooOHTpg0qRJMDY2VvVfUdjb2yMgIAAAkJKSgokTJ761/e+//w4rKysAwI8//qj2XmJiIrp37w5RFPHZZ5/BwcFBdRyFNW7cOLRt2x
aZmZk4dOgQ5s2bBwMDA8yZM6coh6UmLi4OoaGhCA0NxahRo+Dm5oZjx45hxowZePLkCZYtW6ZqO2bMGKxduxYfffQRPv/8c5w7dw4BAQG4c+cOdu3apbbdBw8eYODAgZgwYQJGjBiBNWvWoH///ggKCkKXLl0AAI8ePcLu3bvRv39/ODk54cWLF1i+fDnat2+P27dvF3mGizFjxuDkyZM4cuQI6tevX6y6c735PVuxYsU7w8epU6fw999/F6neN926dQseHh6oVq0apk+fDmNjY2zduhU+Pj7YsWMH+vbtC+Cfz2Hbtm1x584djBo1Cs2aNUNsbCz27t2LqKgo1KtXD+vXr1dtd8WKFbhz547a97RRo0Zq+85tHxsbiyVLlqB///64efMmXFxcAADbtm1DWloaJk6ciMqVK+P8+fNYuHAhoqKisG3btiId59OnT+Hl5YW6deti69at0NHJGwvq1q2rCq2xsbF5viOFref69eto27YtdHV1MW7cONSoUQMPHz7Evn378OOPPxbpnHL8+HF0794dbm5umDlzJmQyGdasWYMPPvgAISEhaNmypVqNAwYMQI0aNRAQEICzZ89iwYIFeP36NdatW6dqU5TPZdu2bTFu3DgolUrcvHkT8+fPx/PnzxESElKk/icNIhJVIGvWrBEBiBcuXBAXLVokmpqaimlpaaIoimL//v3Fjh07iqIoio6OjmLPnj1V64WEhIgAxL/++ktte0FBQfkuF0VRPHHihAhAPHHiRJ73du/eLQIQf/jhB7XlH330kSgIghgeHq62/PDhwyIAcfv27aplvr6+YlG+okOHDhWdnJzUlgEQZ86cqfq5SZMmoo2NjRgXF6dadu3aNVEmk4nDhw9XLZs5c6Zq30+ePBHt7OxET09PMT09vcD9Ozo6iiNGjMj3vcGDB4sWFhZq6+f237Zt2wp1fG3atBFdXV1VP7969SrP8eX6888/RQBiRESEaln79u3F9u3bq34+dOiQCEDctGlTnuN487ORn8ePH4sAxDVr1qgtr1q1qtijRw/Vz29+Hgvy77rat28vAhBnzZql1m7kyJEiAPHGjRuiKIri1atXRQDimDFj1Np98cUXIgDx+PHjascEQNyxY4dqWWJiomhnZyc2bdpUtSwjI0NUKBR5jlVfX1/8/vvvCzyGXG/+PmbMmCHK5XJx9+7dam2KUrcoiuKRI0dEAGJwcLBq2YgRI0RHR0fVz/l9F1u1aiV27949z2ekY8eOYrt27fIc479/n506dRIbNmwoZmRkqJYplUqxTZs2Yu3atVXLvvvuOxGAuHPnzjz9oVQq8yz7d+1vevN7lyv33LB161bVstxz2psCAgJEQRDUPvP5yf1MPn78WIyPjxfr168vuri4iLGxsfm29/DwUJ03RTH/vipsPe3atRNNTU3z1JhfP4liwecUpVIp1q5dW+zWrZvaumlpaaKTk5PYpUsX1bLcPu3Tp4/aNj799FMRgHjt2jVRFIv+ffp3XUOGDBGNjIzyPQ4qHzhkgiqsAQMGID09Hfv370dycjL2799f4HCJbdu2wdzcHF26dEFsbKzq5ebmBhMTE5w4caJI+/77778hl8sxefJkteWff/45RFHEwYMH1Zbn/unOwMCgSPt5U1ZW1ltvFoyOjsbVq1cxcuRIWFpaqpY3atQIXbp0yfdqWlxcHLp16wZTU1Ps3bu32PUlJyfDyMjovY4vIyOj0OvnDlt4W38kJycDACpXrlzsmlJSUhAbG4tnz55hxYoViImJQadOnfK0S0xMRGxsrGqf7yKXy/Nchfv8888BAAcOHAAA1e/rs88+e2u7XFWrVlVd1QQAMzMzDB8+HFeuXEFMTAyAf/pLJvvnPwsKhQJxcXEwMTGBi4sLLl++XKjaAWDRokUICAjAggUL4O3trfZeUesuzO/y33bu3IkLFy5g7ty5ed6zsbFBVFTUW9ePj4/H8ePHMWDAACQnJ6vOB7nfhwcPHuDZs2cAgB
07dqBx48ZqfZurKEOe3pS7vzt37mDZsmUwNjZG69atVe8bGhqq/n9qaipiY2PRpk0biKKIK1euFGofGRkZ6NOnD16
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAscAAAIjCAYAAADvI7a6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACHSElEQVR4nOzdd1xV5R8H8M+9bJmyQUFAkOHCiZgjBbepqTnLkWkpbitXufqVZZa7rDRHuXJkakbhXuQAUUFERFRkI7Jl3vP7A7h5ZchF4HDh83697qs45znnfs/D5fDx8JznSARBEEBERERERJCKXQARERERUW3BcExEREREVIThmIiIiIioCMMxEREREVERhmMiIiIioiIMx0RERERERRiOiYiIiIiKMBwTERERERVhOCYiIiIiKsJwTERERERUhOGYVN727dshkUhw7dq1Eut++uknSCQSDBkyBAUFBTVSz8CBA2FnZ6f0dtOnT4dEIqn6goiIiKjCGI6pzvr9998xdepUdO3aFXv37oWamprYJREREVEtx3BMddKZM2cwevRouLm54ejRo9DW1ha7JCIiIlIBDMdU5wQFBWHw4MGwsrLC33//DUNDwxJt9u/fj3bt2kFHRwempqZ4++23ER0dLV8fHR2N0aNHo1GjRtDS0oKDgwM+/vhjpKenl9jXL7/8AhsbGxgZGWHlypXy5fv27YO1tTVMTU3x1Vdfldju77//RrNmzaCnp4eZM2dCEAQAhcG+adOmMDAwwNy5cxWGg5w5cwYSiQRnzpxR2NeAAQMgkUiwbNky+bJly5ZBIpEgKSlJoe21a9cgkUiwfft2+bIHDx6UWAYAPj4+kEgkmDBhgsLylJQUzJ49GzY2NtDS0oKjoyO++uoryGSyEvtcvXp1iWNv0aIFXn/9dYVjKu/1/HGVJjs7G8uWLUOzZs2gra0NKysrDB06FBEREZU6PgB4/fXXS62leB9Lly6FhoYGEhMTS2w7ZcoUGBkZITs7G+fPn4e3tzdMTU2ho6ODNm3a4Pvvv5d/v8t7r+dfxbZt24aePXvC3NwcWlpacHNzw/fff19u/7zIzs6u3GN7XvHn6MXXi312+vRpdO3aFQ0bNlRoN3369HJrKf7+79u3D4sWLYKlpSV0dXUxaNAgREVFKbQ9f/483nrrLdja2kJLSws2NjaYM2cOnj17VqJmNzc36OnpwcDAAJ06dcLhw4dLvHfx56K8/lbmszNhwoRSh1S9ys/mhAkToKenV3rnlbL/Z8+ewcXFBS4uLgr9kpycDCsrK3Tu3LncIWbFw9QePHggXxYSEoKGDRti4MCByM/PV2j/sp8ToOLfNwC4c+cORowYATMzM+jo6MDZ2RmLFy9W6LfyXs+fGy9fvoy+ffvC0NAQDRo0QPfu3XHx4kWF9yveZ/H7GhgYwMTEBLNmzUJ2drZC2/z8fHz22Wdo2rQptLS0YGdnh0WLFiEnJ0eh3fM/X1KpFJaWlhg5ciQePXpUZr9T7aIudgFEVSkiIgJ9+/aFlpYW/v77b1hZWZVos337dkycOBEdOnTAypUrER8fj3Xr1uHixYu4fv06jIyMEBERgfj4eMyYMQMNGzZESEgI1q9fj5MnT+LChQvQ0dEBAFy8eBHjx49H586dMXr0aPzyyy+4f/8+nj17hhUrVmDRokX4559/sGDBAtja2mL06NEAgPv372PIkCFwdHTEF198AV9fX/mYaR8fH8yYMQPXr1/HmjVrYGZmhoULF5Z5zOfOncPx48ervC/v3buHn376qcTyrKwsdO/eHdHR0Xj//fdha2uLS5cuYeHChYiNjcXatWuVeh9XV1f88ssv8q9//PFHhIaGYs2aNfJlrVq1KnP7goICDBw4ECdPnsSoUaMwa9YspKenw8/PD8HBwWjatKlSx/c8FxcX+S/mpKQkzJkzR77unXfewYoVK7Bv3z6FAJibm4sDBw5g2LBh0NbWxqVLl2Bubo5PPv
kEampqOHv2LKZNm4abN2/KQ+3ixYvx3nvvKbzPlClT0LVr1xI1ff/992jevDkGDRoEdXV1HD16FNOmTYNMJoOPj0+5x/M8d3d3zJs3DwAQGRmJJUuWlNv++e/R8/1QvP2AAQNgZWWFJUuWwMzMTN5HFfX5559DIpFg/vz5SEhIwNq1a+Ht7Y2goCD5z9v+/fuRlZWFqVOnwsTEBFeuXMGGDRvw+PFj7N+/X76vzMxMvPnmm7Czs8OzZ8+wfft2DBs2DP7+/ujYsWOJ936+rw8dOoTff/+93For8tkRi46ODnbs2IHXXnsNixcvxrfffgug8LySmpqK7du3KzXELCoqCn379oWLiwt+++03qKuXjA3l/ZwAFf++3bx5E127doWGhgamTJkCOzs7RERE4OjRo/j8888xdOhQODo6ytvPmTMHrq6umDJlinyZq6srAODUqVPo168f2rVrh6VLl0Iqlcr/YXn+/PkSn4MRI0bAzs4OK1euxL///ov169fj6dOn2Llzp7zNe++9hx07dmD48OGYN28eLl++jJUrVyI0NLTEZ6Zr166YMmUKZDIZgoODsXbtWsTExOD8+fMV7nsSkUCk4rZt2yYAEI4dOyY0bdpUACD07t271La5ubmCubm50KJFC+HZs2fy5ceOHRMACEuWLCnzffz8/AQAwooVK+TLBg0aJNjb2wvZ2dmCIAhCenq6YG9vLzRo0EC4f/++IAiCIJPJhNdee01o3bq1fLuZM2cK+vr6QlJSkiAIgpCXlyd06tRJACBcvnxZ3m706NGCubm5fP+nT58WAAinT5+Wt/Hw8BD69esnABCWLl0qX7506VIBgJCYmKhwHFevXhUACNu2bZMvi4yMLLFsxIgRQosWLQQbGxth/Pjx8uWfffaZoKurK9y9e1dhvwsWLBDU1NSER48eKezz66+/LtGXzZs3F7p3715iuSAIwvjx44UmTZqUuq40P//8swBA+Pbbb0usk8lkSh9fsddee03o0aOH/OvS9uHp6Sl4eHgobHfo0KES36MXLV68WAAgnDt3rsS60t7neVlZWSWW9enTR3BwcCjz/V5kbW0tDBw4UP51aZ+J52uVSCQKy5o0aaLQZz/88IMAQPD391doB0Dw8fEpt5biz3SjRo2EtLQ0+fLffvtNACCsW7dOvqy0Y1+5cqUgkUiEhw8flvkeCQkJAgBh9erVCsvDw8MFAMKOHTvky4p/boop89mZOHGiYGtrW+L9X+Vnc/z48YKurm6Zx1ba/gVBEBYuXChIpVLh3Llzwv79+wUAwtq1a8vdjyD8dz6NjIwUkpOTBTc3N8HZ2Vl+rnpRRX5OKvp969atm6Cvr1/ie1n8c/yiFz+Hz7d3cnIS+vTpo7BtVlaWYG9vL/Tq1Uu+rPh7MWjQIIV9TJs2TQAg3LhxQxAEQQgKChIACO+9955Cuw8//FAAIJw6darcusaMGSM0aNCg1OOg2ofDKqjOmDBhAqKiojBmzBj8888/Clckil27dg0JCQmYNm2awjjkAQMGwMXFBX/++ad8WV5eHpKSkuQvd3d3tG/fXmG/J0+eRP/+/aGlpQUA0NPTg5ubG8zMzGBvbw8A8tkybty4gSdPnsi369atG0xMTAAA6urqaNeuHQAoXNEYOnQoEhISEBwcXOoxHzp0CFevXsWXX35ZqT4rS0BAAPbv34+VK1dCKlU8Tezfv1/+5/Pn+8fb2xsFBQU4d+6cQvusrCyFdklJSVU6c8jBgwdhamqKGTNmlFhX1uwf5R1fsdzcXPn3tSzjxo3D5cuX5cM3AGDXrl2wsbFB9+7d5cte7IPJkydDQ0Oj1M/oyxRfRQWA1NRUJCUloXv37rh//z5SU1MrtI/s7OwKj8OvSD8UDzcq/jxXxrhx46Cvry//evjw4bCyslL4q8jzx56ZmYmkpCR07twZgiDg+vXrCvsr/vmNiIjAl19+CalUitdee63EsQF46fE9r7zPjrm5ORISEuT7fZnk5G
SFz0V537/iNi/+qb8sy5YtQ/PmzTF+/HhMmzYN3bt3x8yZMyu0LVD4GRk0aBASExPh6+tb5ve2Ip+PinzfEhMTce7
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Метод локтя\n",
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"\n",
|
|||
|
"border_l = 2\n",
|
|||
|
"border_r = 5\n",
|
|||
|
"\n",
|
|||
|
"inertia = []\n",
|
|||
|
"for k in range(border_l, border_r):\n",
|
|||
|
" kmeans = KMeans(n_clusters=k, random_state=42)\n",
|
|||
|
" kmeans.fit(df_scaled)\n",
|
|||
|
" inertia.append(kmeans.inertia_)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация метода локтя\n",
|
|||
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
"plt.plot(range(border_l, border_r), inertia, marker='o')\n",
|
|||
|
"plt.title('Метод локтя для выбора количества кластеров')\n",
|
|||
|
"plt.xlabel('Количество кластеров')\n",
|
|||
|
"plt.ylabel('Инерция')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Коэффициент силуэта\n",
|
|||
|
"from sklearn.metrics import silhouette_score\n",
|
|||
|
"\n",
|
|||
|
"silhouette_scores = []\n",
|
|||
|
"for k in range(border_l, border_r):\n",
|
|||
|
" kmeans = KMeans(n_clusters=k, random_state=42)\n",
|
|||
|
" kmeans.fit(df_scaled)\n",
|
|||
|
" score = silhouette_score(df_scaled, kmeans.labels_)\n",
|
|||
|
" silhouette_scores.append(score)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация коэффициента силуэта\n",
|
|||
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
"plt.plot(range(border_l, border_r), silhouette_scores, marker='o')\n",
|
|||
|
"plt.title('Коэффициент силуэта для различных кластеров')\n",
|
|||
|
"plt.xlabel('Количество кластеров')\n",
|
|||
|
"plt.ylabel('Коэффициент силуэта')\n",
|
|||
|
"plt.show()\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Кластеризируем с помощью K-means\n",
|
|||
|
"## K-means — это алгоритм кластеризации, который группирует данные вокруг центров (центроидов) кластеров. Я выбираю количество кластеров, равное 2, исходя из коэффициента силуэта на диаграмме выше"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqcAAAIjCAYAAAA+xLLKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gU1/rA8e/M9l16r4KCKCCKYu+9l5hieqKm93Jzk5vkprebm140PZpiTCwxMWrsXVGxgF2UIiC9Lixsn98fXPmFoEYTjMacj88+j8zMzpw5O8DLKe+RFEVREARBEARBEISLgHyhCyAIgiAIgiAIJ4ngVBAEQRAEQbhoiOBUEARBEARBuGiI4FQQBEEQBEG4aIjgVBAEQRAEQbhoiOBUEARBEARBuGiI4FQQBEEQBEG4aIjgVBAEQRAEQbhoiOBUEARBEARBuGiI4FQQBEH4Wxs7diy33XbbhS5GM8uXL8fDw4OysrILXRRB+NOJ4FQ4Z7Nnz0aSJHbu3Nli3yeffIIkSVx22WW4XK4LUDpB+PsaPHgwnTp1arF9zZo1GI1GunXrRmVl5QUo2cVry5YtrFy5kscee6xp2/r165Ekqeml0Who164dN910E9nZ2S3OYTabee655+jSpQseHh4YDAY6derEY489RmFh4SmvO2XKFCRJanbdXxo9ejSxsbG88sorrXOjgvAXIoJTodUsWrSIu+66iwEDBvDtt9+iUqkudJEE4W9v7dq1TJgwgQ4dOrB69Wr8/PwudJEuKq+99hrDhg0jNja2xb7777+fr776io8//phx48bx3Xff0aNHj2YBZ3Z2NsnJybzwwgskJCTw6quv8u677zJkyBA+++wzBg8e3OK8ZrOZn376iejoaObOnYuiKKcs2x133MFHH31EbW1tq92vIPwViOBUaBXr16/n2muvJSEhgZ9++gm9Xn+hiyQIf3sbNmxgwoQJxMXFicD0FEpLS1m6dClTpkw55f4BAwZwww03MG3aNN577z1ef/11Kisr+eKLLwBwOp1cfvnllJSUsH79eubOncs999zDbbfdxnvvvUd2djZXXXVVi/MuXLgQl8vF559/Tn5+Phs3bjzl9a+44gpsNhvz589vvZsWhL8AEZwKf1h6ejqTJk0iNDSUFStW4O3t3eKY3NzcZt1kv3z90uuvv07fvn3x9/fHYDCQkpLCggULTnndr7/+mp49e2I0GvH19WXgwIGsXLkSgOjo6NNeT5IkoqOjm87jdrt5++23SUxMRK/XExwczB133EFVVVWz60VHRzN+/HhWrlxJcnIyer2ehIQEvv/++2bHnWnYw0mDBw8+ZYvK2Vq7di0DBgzAZDLh4+PDpEmTOHToULNjnn322TPWwezZs5uOnTp1Kh4eHmRnZzNq1ChMJhNhYWE8//zzLVp1zra+4Myfe25ubrNjq6urefDBB4mMjESn0xEbG8urr76K2+1ucd7T3dvUqVObHXfixAmmT59OcHAwOp2OxMREPv/882bHnOzCPdVz5uHh0eycp/tsy8vLkSSJZ599tkUZy8vLW5z3pOjo6BZlPpd6OJNNmzYxbtw4YmNjWb16Nf7+/mc8/uTzvX79erp3747BYCApKYn169cD8P3335OUlIReryclJYU9e/a0OMfhw4e58sor8fPzQ6/X0717dxYvXtzsmMrKSh555BGSkpLw8PDAy8uLMWPGkJGR0ey4k5/LvHnzeOmll4iIiECv1zNs2DCOHTvW7NijR49yxRVXEBISgl6vJyIigmuuuYaampoz3vPSpUtxOp0MHz78jMedNHToUABycnKAxiAzIyODJ598kv79+7c43svLi5deeqnF9jlz5jBixAiGDBlCfHw8c+bMOeX1goKC6Ny5Mz/++ONZlU8QLhXqC10A4a8tKyuL0aNHo9PpWLFiBaGhoWc8/vbbb2fAgAFA4y+7RYsWNdv/zjvvMHHiRK6//nrsdjvffvstV1
11FUuWLGHcuHFNxz333HM8++yz9O3bl+effx6tVsv27dtZu3YtI0eO5O2336aurg6AQ4cO8fLLL/PEE08QHx8PNAYdJ91xxx3Mnj2badOmcf/995OTk8P777/Pnj172LJlCxqNpunYo0ePcvXVV3PnnXdy8803M2vWLK666iqWL1/OiBEj/lhlnqXVq1czZswY2rVrx7PPPktDQwPvvfce/fr1Y/fu3c0Cb4APPvig2f3m5OTw9NNPtzivy+Vi9OjR9O7dm//+978sX76cZ555BqfTyfPPP9903LnU10nXXnstY8eOBWDZsmXMnTu32f76+noGDRrEiRMnuOOOO2jTpg1bt27l8ccfp6ioiLfffvuUdfHVV181/f+hhx5qtq+kpITevXsjSRL33nsvgYGB/Pzzz9xyyy2YzWYefPDBU57zQvq99fBrW7ZsYezYsbRt25Y1a9YQEBBwVu87duwY1113HXfccQc33HADr7/+OhMmTODDDz/kiSee4O677wbglVdeYcqUKRw5cgRZbmzjOHDgAP369SM8PJx//etfmEwm5s2bx2WXXcbChQuZPHky0NgN/sMPP3DVVVfRtm1bSkpK+Oijjxg0aBAHDx4kLCysWZn+85//IMsyjzzyCDU1Nfz3v//l+uuvZ/v27QDY7XZGjRqFzWbjvvvuIyQkhBMnTrBkyRKqq6tP+cfySVu3bsXf35+oqKizqp+srCyApkD/ZOB94403ntX7AQoLC1m3bl1T6+u1117LW2+9xfvvv49Wq21xfEpKCj/88MNZn18QLgmKIJyjWbNmKYCyZMkSJSYmRgGUkSNHnvE9R48eVQDliy++aNr2zDPPKL9+BOvr65t9bbfblU6dOilDhw5tdi5ZlpXJkycrLper2fFut7vFtdetW6cAyrp161rs27RpkwIoc+bMabZ9+fLlLbZHRUUpgLJw4cKmbTU1NUpoaKjStWvXpm0n6yctLe1UVaEoiqIMGjRIGTRo0Gn3n0lycrISFBSkVFRUNG3LyMhQZFlWbrrppqZtJ+u3rKys2fvT0tIUQJk1a1bTtptvvlkBlPvuu69pm9vtVsaNG6dotdqmc5xLfSmKomRmZiqA8vrrrzdte+211xRAycnJadr2wgsvKCaTScnMzGz2/n/961+KSqVS8vLymm1/8sknFUmSmm2LiopSbr755qavb7nlFiU0NFQpLy9vdtw111yjeHt7Nz1rJ5+P+fPnK79mMpmanfN0n21ZWZkCKM8880zTttPV/5nKfK718GuDBg1S/Pz8FE9PTyUxMVEpLS094/G/LgugbN26tWnbihUrFEAxGAzK8ePHm7Z/9NFHLb6nhg0bpiQlJSlWq7Vpm9vtVvr27au0b9++aZvVam3xfZuTk6PodDrl+eefb9p28nOJj49XbDZb0/Z33nlHAZR9+/YpiqIoe/bsOe3n91v69++vpKSktNh+8tqff/65UlZWphQWFipLly5VoqOjFUmSmj7/rl27Kt7e3ud0zddff10xGAyK2WxWFOX/v0cWLVp0yuNffvllBVBKSkrO6TqC8FcmuvWF323q1Knk5+dz3XXXsXLlyjOOi7Lb7QDodLozntNgMDT9v6qqipqaGgYMGMDu3bubtv/www+43W6efvrpplabk349TOC3zJ8/H29vb0aMGEF5eXnTKyUlBQ8PD9atW9fs+LCwsKYWIGjstrvpppvYs2cPxcXFzY6tqamhvLy8VSczFBUVkZ6eztSpU5uNH+zcuTMjRoxg2bJlf+j89957b9P/T7Y42u12Vq9eDZx7fVmtVoDfHIM8f/58BgwYgK+vb7PzDh8+HJfL1WJMnt1uP+OzpCgKCxcuZMKECSiK0uyco0aNoqamptkzBVBbW9vsuDN1x5/8bE++zjQDvrKykvLyciwWyxnr4PfUw6lYLBZqa2sJDg7Gy8vrN4//pYSEBPr06dP0da9evYDG7uw2bdq02H5y5nplZSVr165lypQpzeqxoq
KCUaNGcfToUU6cOAE0/gw4+X3rcrmoqKjAw8ODDh06tPhMAKZNm9asRfFkz8vJa59sGV2xYgX19fXndL8VFRX4+vq
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Кластеризация с помощью K-means\n",
|
|||
|
"optimal_clusters = 2\n",
|
|||
|
"kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)\n",
|
|||
|
"df['Cluster'] = kmeans.fit_predict(df_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация кластеров с использованием PCA\n",
|
|||
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
"plt.scatter(df_pca[:, 0], df_pca[:, 1], c=df['Cluster'], cmap='viridis', edgecolor='k', alpha=0.6)\n",
|
|||
|
"plt.title(\"Кластеры, определенные K-means (PCA)\")\n",
|
|||
|
"plt.xlabel(\"Главная компонента 1\")\n",
|
|||
|
"plt.ylabel(\"Главная компонента 2\")\n",
|
|||
|
"plt.colorbar(label='Кластер')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Приступим к иерархической кластеризации\n",
|
|||
|
"## Иерархическая кластеризация — метод, который строит древовидную структуру кластеров (дендрограмму). Применим её для сравнения с K-means"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/AAAALBCAYAAAD/DkqyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADvmUlEQVR4nOzdd5xTVfrH8W+mZAodlI6AgIuKqGBBEAug6FrAiqACYkNURGX92QuyYmEVdRUL0hTsyrqri7oodulF1HVBEYaqtBlghklmcn5/jAnpk8wkuSmf9+s1L5KbO+GZlHvvc85zzrEZY4wAAAAAAEBSy7I6AAAAAAAAUD0SeAAAAAAAUgAJPAAAAAAAKYAEHgAAAACAFEACDwAAAABACiCBBwAAAAAgBZDAAwAAAACQAkjgAQAAAABIASTwAAAAAACkABJ4AAAAVMtms+n++++3OgzLzJ8/XzabTfPnz7c6FI9Ufk/uv/9+2Ww2q8MAUg4JPJCC3nrrLdlstqA/Xbp0sTo8AEAcPfvss7LZbDr++OOtDiUtPfvss5o+fbqlMSxfvlyXXXaZ2rRpo7y8PDVu3Fj9+vXTtGnTVFlZmZAYNm3apPvvv1/Lly9PyP8HIDI5VgcAoObuvPNOHXrooZ77f/3rXy2MBgCQCLNmzVK7du20cOFCrVmzRh07drQ6pLTy7LPP6oADDtDw4cN9tp900kkqKyuT3W6P6/8/ZcoUjRw5Us2aNdPll1+uTp06affu3Zo3b56uvPJKbd68WXfeeWdcY5CqEvgHHnhA7dq101FHHRXz57/77rt1++23x/x5gXRHAg+ksNNOO02nnHKK5/6UKVO0bds26wICAMTV2rVr9fXXX+udd97Rtddeq1mzZum+++6zOqykZYzRvn37VFBQUOvnysrKUn5+fgyiCu3bb7/VyJEjdcIJJ+iDDz5QvXr1PI+NGTNGixcv1qpVq+IaQ7zt3btXderUUU5OjnJySEWAaFFCD6Qgh8MhqepiIhK7du3SmDFjPKV4HTt21COPPCKXy+XZ59dff5XNZtPEiRMDfr9Lly4+DQVu7vFr/j/B9n3zzTfVvXt3FRQU6IADDtBll12mjRs3+uwzfPjwoM/n3bvUrl07nX322froo4901FFHKT8/X4cddpjeeecdn+fasWOHxo4dqyOOOEJ169ZV/fr1deaZZ2rFihU++7nHNNpstoAywY0bNyo7O1s2m01vvfVWQJzBeiQmTJggm82munXr+myfNm2a+vTpo6ZNmyovL0+HHXaYJk+eHPD7kfKO22azKS8vT4cccogmTJggY4zPvsuWLdOZZ56p+vXrq27duurbt6++/fbbgOfctWuXbr75ZrVr1055eXlq3bq1hg4d6tMotG/fPt1///065JBDlJ+frxYtWuj888/Xzz//7PkMhftx92hF+/54v/5udevWDegh++WXX3TRRRepcePGKiwsVI8ePfT+++/X+LWTqj5zwf4W/3Gw//73v9W7d2/VqVNH9erV01lnnaXvv//eZ5/hw4cHfDak/cNivJ/zlFNOCTokZuLEibLZbPr111892/7xj3/orLPOUsuWLZWXl6cOHTrowQcfDFpqO3nyZHXp0kWFhYU+f0+w19hbsPGqe/bsUfPmzQNiHzlypDp16qTCwkI1btxYffr00RdffBH2+aWafbcqKir04IMPqkOHDsrLy1O7du105513qry8POA5wn1GvV9PKbLjpv9rE+rzHu3rF86sWbPUqFEjnXXWWbrwwgs1a9asoPtt375dl19+uerXr6+GDRtq2LBhWrFihWw2W0B5+JtvvqnDDjtM+fn56tKli959910NHz5c7dq1qzaeSI4v06dPl81m05dffqnRo0frwAMPVMOGDXXttdfK4XBo165dGjp0qBo1aqRGjRrptttuC/guulwuTZo0SYcffrjy8/PVrFkzXXvttdq5c6fPfu5zxIcffqhjjjlGBQUFev
755yVFdhxu166dvv/+e3322WcB5zT/MfA33HCD6tatq9LS0oDXZfDgwWrevLnPdzCSY8QDDzwgm82mWbNm+STvbsccc0zAcc9bqPct2Ofv448/1oknnqiGDRuqbt26+tOf/uTp2Z8/f76OPfZYSdIVV1zheS28PzsLFizQGWecoQYNGqiwsFAnn3yyvvrqq6D/7w8//KAhQ4aoUaNGOvHEE0PGZLPZdMMNN2jOnDnq0qWL8vLydPjhh2vu3LkBf9P8+fN1zDHHKD8/Xx06dNDzzz/PuHpkBJq9gBTkTuDz8vKq3be0tFQnn3yyNm7cqGuvvVYHHXSQvv76a91xxx3avHmzJk2aVOt4Jk+e7LmovuOOOwIenz59uq644gode+yxmjBhgrZu3aonn3xSX331lZYtW6aGDRt69s3Ly9OUKVN8ft//Imb16tUaNGiQRo4cqWHDhmnatGm66KKLNHfuXJ122mmSqhK5OXPm6KKLLlL79u21detWPf/88zr55JP1ww8/qGXLlj7PmZ+fr2nTpunJJ5/0bJsxY4bsdrv27dsX8Dfl5OTo+++/17Jly3T00Uf7/K3BemgmT56sww8/XOeee65ycnL0z3/+U6NGjZLL5dL1118f6qWtlnsYRVlZmV5//XXdeeedatq0qa688kpJ0vfff6/evXurfv36uu2225Sbm6vnn39ep5xyij777DPPGNo9e/aod+/e+vHHHzVixAh169ZN27Zt03vvvacNGzbogAMOUGVlpc4++2zNmzdPl1xyiW666Sbt3r1bH3/8sVatWqV+/frp5Zdf9sT2zjvv6N133/XZ1qFDB0nRvz+R2Lp1q3r27KnS0lKNHj1aTZo00YwZM3Tuuefqrbfe0nnnnRfVa+etd+/euuaaayRJP/74ox566CGfx19++WUNGzZM/fv31yOPPKLS0lJNnjxZJ554opYtWxZRIlQb06dPV926dXXLLbeobt26+uSTT3TvvfeqpKREjz32mGe/119/XaNGjdIpp5yiG2+8UXXq1An690Tqb3/7m7Zu3Rqw3eFw6LLLLlPr1q21Y8cOPf/88zrjjDP0448/6qCDDgr7nNF+t6666irNmDFDF154oW699VYtWLBAEyZM0I8//qh333036P8xePBg/fnPf5YkffDBB3r11Vd9Hq/pcdP7s37zzTeH/Tul0K9fOLNmzdL5558vu92uwYMHa/LkyVq0aJEn2ZKqkt1zzjlHCxcu1HXXXafOnTvrH//4h4YNGxbwfO+//74GDRqkI444QhMmTNDOnTt15ZVXqlWrVtXGEunxxe3GG29U8+bN9cADD+jbb7/VCy+8oIYNG+rrr7/WQQcdpIceekgffPCBHnvsMXXp0kVDhw71/O61117rOZeMHj1aa9eu1d///nctW7ZMX331lXJzcz37/vTTTxo8eLCuvfZaXX311frTn/4kKbLj8KRJk3TjjTeqbt26uuuuuyRJzZo1C/r3Dxo0SM8884zef/99XXTRRZ7tpaWl+uc//6nhw4crOztbUmTHiNLSUs2bN08nnXRStd+T2vr+++919tlnq2vXrho3bpzy8vK0Zs0aTwJ+6KGHaty4cbr33nt1zTXXqHfv3pKknj17SpI++eQTnXnmmerevbvuu+8+ZWVleRpIvvjiCx133HE+/99FF12kTp066aGHHgraUOrtyy+/1DvvvKNRo0apXr16euqpp3TBBRdo/fr1atKkiaSqhqMzzjhDLVq00AMPPKDKykqNGzdOBx54YKxfKiD5GAApZ9KkSUaSWbFihc/2k08+2Rx++OE+2x588EFTp04d87///c9n++23326ys7PN+vXrjTHGrF271kgyjz32WMD/d/jhh5uTTz45YPudd95pJJlt27aF3NfhcJimTZuaLl26mLKyMs/2f/3rX0aSuffeez3bhg0bZurUqRP2b2/btq2RZN5++23PtuLiYtOiRQtz9NFHe7bt27fPVFZW+vzu2rVrTV
5enhk3bpxn26effmokmcGDB5smTZqY8vJyz2OdOnUyQ4YMMZLMm2++GRDnOeecY2644QbP9i+++MIUFBSYgQMHBvw
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x800 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqcAAAIjCAYAAAA+xLLKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gVVfrA8e/M7TW9khAglFAFEUSRIqCCiN3d1XXtdd1d3XXRVdfeRVexr7u2dV0VK1YQkCaC9N7Te09ucvvMnN8fgfyMSSgaBPF8ePI8ZMq5Z+bOnbz3zDnvUYQQAkmSJEmSJEk6AqiHuwKSJEmSJEmStJcMTiVJkiRJkqQjhgxOJUmSJEmSpCOGDE4lSZIkSZKkI4YMTiVJkiRJkqQjhgxOJUmSJEmSpCOGDE4lSZIkSZKkI4YMTiVJkiRJkqQjhgxOJUmSJEmSpCOGDE4lSZKkNjZv3sxHH33U+vv69ev57LPPDl+FpJ+9Z599loaGhtbfn3rqKfx+/wHvv3LlSqxWK4WFhYegdj/cqFGjuOWWWw53NY46R1xw+tprr6EoCqtXr2637l//+heKonD22Wej6/phqJ0kSdLRr6mpiWuvvZYVK1awa9cubrzxRjZt2nS4qyX9jH3yySfcc889FBcX8+abb3LnnXficDgOeP877riDCy+8kKysrNZl48ePR1GU1p/4+HhGjBjBK6+8gmEY7cpYtGgR5557LqmpqVitVpKTk5k2bRoffPBBh6+5bds2FEXBbre3Cay/69Zbb+W5556joqLigI9F2r8jLjjtzIcffsj111/PmDFjePvttzGZTIe7SpIkSUelE044ofWnb9++VFRUcPXVVx/uakk/Y7fffjuvvvoq3bt355JLLuH+++9HVQ8sBFm/fj3z58/nuuuua7cuIyODN954gzfeeIM777wTTdO48soruf3229tsd/fdd3PyySezefNmrr32Wl588UWmT59Oc3Mz5513Hv/73//alf3f//6X1NRUAN57770O63bWWWfh9Xp5/vnnD+hYpAOjCCHE4a7Ed7322mtcfvnlrFq1iuOOOw5o+bYzefJk+vbty9KlS4mJiTnMtZQkSTr6bd26lWAwyODBg7FarYe7OtLPXENDA9u2bSMzM5OMjIwD3u/GG2/ko48+oqCgAEVRWpePHz+empoaNm/e3LosEAjQr18/6uvrqa+vx2Kx8N5773HBBRdw/vnn87///Q+LxdKm/Llz5xKNRjnjjDNalwkh6NWrF+eeey75+fnU19ezcOHCDuv3xz/+kU8++YT8/Pw29ZN+BHGEefXVVwUgVq1aJYQQYt26dcLr9YoePXqIsrKyDvfJz88XQIc/3zVjxgxxwgkniPj4eGG328Wxxx4r3n333Q7LfOONN8SIESOEw+EQsbGxYsyYMWLu3LlCCCGysrI6fT1AZGVltZaj67p48sknxYABA4TNZhPJycnimmuuEXV1dW1eLysrS0ydOlXMnTtXHHPMMcJms4n+/fuL999/f5/npyPjxo0T48aN63T9/ixYsECcdNJJwul0ipiYGHHmmWeKrVu3ttnm7rvv3uc5ePXVV1u3vfTSS4XL5RK5ubni1FNPFU6nU6SlpYl7771XGIbRptwDPV9C7Pt9z8/Pb7NtfX29uPHGG0VGRoawWq0iOztbPPLII0LX9XbldnZsl156aZvtSkpKxOWXXy6Sk5OF1WoVAwYMEC+//HKbbRYuXCiADq8zl8vVpszO3tvq6moBiLvvvrtdHaurq9uVu1dWVla7Oh/MeTiQ8mbNmtXumhei66/72tpacfPNN4tBgwYJl8slPB6PmDx5sli/fn2b7fae77ffflvcdtttIiUlRTidTjFt2jRRVFTUut3WrVuF3W4Xv/vd79rsv3TpUqGqqrjlllsO+rj3vn/fv/Y6+jyGQiFx1113iezsbGG1WkVGRoaYPn26CIVC4vv2dS/qrH5XX321sNlsYu
HChW2WP/fcc2LAgAHCarWKtLQ08fvf/17U19fvt74rV67s8J7akY72f+CBB4SiKOLNN99ss/yXeO/+7vWh67oYPHhwu3vm3XffLfr37996rR9//PHiww8/bFPekiVLxPnnny8yMzNbr6GbbrpJBAKBNtvtvf9+37vvviuANtfI3s/P96+b008/vd09SIiDuwceaJkd6d69u7jsssvaLR83bpwYOHBgu+Xnn3++AERpaakQQoicnBwRHx8vfD7ffl9rr6VLlwpArFy5UrzzzjtCVVVRXFzc4bazZ88WgFi7du0Bly/tm/nHBreHUm5uLpMnT8ZmszF37lzS0tL2uf0111zDmDFjAPjggw/48MMP26yfOXMmZ555Jr/97W+JRCK8/fbbXHDBBXz66adMnTq1dbt7772Xe+65hxNPPJH77rsPq9XKt99+y1dffcWpp57KU089RXNzM9DSJ+Whhx7i9ttvp3///gC43e7Wsq699trW1uA//elP5Ofn8+yzz7Ju3TqWLVvW5hvcrl27+PWvf811113HpZdeyquvvsoFF1zAnDlzOOWUU37cyTxA8+fPZ8qUKfTq1Yt77rmHYDDIM888w+jRo1m7di09evRos/0LL7zQ5njz8/O566672pWr6zqTJ09m1KhRPPbYY8yZM4e7774bTdO47777Wrc7mPO114UXXsjpp58OwOeff85bb73VZn0gEGDcuHGUlpZy7bXX0r17d7755htuu+02ysvLeeqppzo8F2+88Ubr///85z+3WVdZWcmoUaNQFIU//OEPJCUl8cUXX3DllVfi8/m46aabOizzcPqh56EzmqZxxx13dLiuq6/7vLw8PvroIy644AJ69uxJZWUl//znPxk3bhxbt24lPT29zes/+OCDKIrCrbfeSlVVFU899RSTJk1i/fr1OBwO+vfvz/3338/06dM5//zzOfPMM/H7/Vx22WXk5OS0uSYP5rgPhGEYnHnmmXz99ddcc8019O/fn02bNvHkk0+yc+fONgOR9ncv6sjdd9/Nyy+/zDvvvMP48eNbl99zzz3ce++9TJo0ieuvv54dO3bwwgsvsGrVqk4/W3vdeuutP/h4X331Vf7+97/zxBNPcNFFF3W4zS/13v3GG2902JfX7/dzzjnn0KNHD4LBIK+99hrnnXcey5cvZ+TIkQC8++67BAIBrr/+ehISEli5ciXPPPMMJSUlvPvuu/t8Tw7GkiVL+Pzzz9st/zH3wM7K7EhpaSlFRUUce+yxB1znvLw8TCYTsbGx7Nq1i+3bt3PFFVfg8XgOuIw333yT7OxsRowYwaBBg3A6nbz11ltMnz693bbDhw8HYNmyZQwbNuyAX0Pah8MdHX/f3m+Xn376qcjOzhaAOPXUU/e5z65duwQgXn/99dZle1uWvuv73ygjkYgYNGiQmDBhQpuyVFUV55xzTrvWpO+38gnR+bdCIf7/m9f3WwvmzJnTbvneb/Tf/bbd2Ngo0tLSxLBhw1qXHeqW06FDh4rk5GRRW1vbumzDhg1CVVVxySWXtC7rrOVu1apVHbacAuKPf/xj6zLDMMTUqVOF1WptLeNgzpcQQuzcuVMA4vHHH29dNmPGjHatE/fff79wuVxi586dbfb/29/+JkwmU5sWNSGEuOOOO4SiKG2Wfb916sorrxRpaWmipqamzXa/+c1vRExMTOu1diS1nB7sedhfec8//7yw2Wzi5JNPbtPidCiu+1Ao1O7zmJ+fL2w2m7jvvvtal+093926dWvTSrK3pXPmzJmty3RdFyeddJJISUkRNTU14oYbbhBms7nd+T/Q43799dcFIPLy8trs//3P4xtvvCFUVRVLly5ts92LL74oALFs2TIhxIHfi75bv3/+858CEM8880yb7auqqoTVahWnnnpqm7KeffZZAYhXXnml0/p+/vnnAhCTJ08+6JbTzz77TJjNZnHzzTd3uO0v8d69994UCoVE9+7dxZQpU9rdM7+vqqqq3b3u++
dECCEefvhhoSiKKCwsbF32Y1tOjz/++NY6fvcedLD3wAMpsyPz588XgPjkk0/arRs3bpzIyckR1dXVorq6Wmzbtk3
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.cluster import AgglomerativeClustering\n",
|
|||
|
"from scipy.cluster.hierarchy import dendrogram\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Применение иерархической кластеризации\n",
|
|||
|
"hierarchical = AgglomerativeClustering(n_clusters=optimal_clusters, compute_distances=True)\n",
|
|||
|
"df['Hierarchical Cluster'] = hierarchical.fit_predict(df_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# Функция для получения матрицы linkage\n",
|
|||
|
"def get_linkage_matrix(model: AgglomerativeClustering) -> np.ndarray:\n",
|
|||
|
" counts = np.zeros(model.children_.shape[0]) # type: ignore\n",
|
|||
|
" n_samples = len(model.labels_)\n",
|
|||
|
" for i, merge in enumerate(model.children_): # type: ignore\n",
|
|||
|
" current_count = 0\n",
|
|||
|
" for child_idx in merge:\n",
|
|||
|
" if child_idx < n_samples:\n",
|
|||
|
" current_count += 1\n",
|
|||
|
" else:\n",
|
|||
|
" current_count += counts[child_idx - n_samples]\n",
|
|||
|
" counts[i] = current_count\n",
|
|||
|
"\n",
|
|||
|
" return np.column_stack([model.children_, model.distances_, counts]).astype(float)\n",
|
|||
|
"\n",
|
|||
|
"# Построение дендрограммы\n",
|
|||
|
"linkage_matrix = get_linkage_matrix(hierarchical)\n",
|
|||
|
"plt.figure(figsize=(12, 8))\n",
|
|||
|
"dendrogram(linkage_matrix)\n",
|
|||
|
"plt.title(\"Дендограмма, восстановленная из модели AgglomerativeClustering\")\n",
|
|||
|
"plt.xlabel(\"Индексы объектов\")\n",
|
|||
|
"plt.ylabel(\"Евклидово расстояние\")\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация кластеров\n",
|
|||
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
"plt.scatter(df_pca[:, 0], df_pca[:, 1], c=df['Hierarchical Cluster'], cmap='viridis', edgecolor='k', alpha=0.6)\n",
|
|||
|
"plt.title(\"Кластеры, определенные иерархической кластеризацией (PCA)\")\n",
|
|||
|
"plt.xlabel(\"Главная компонента 1\")\n",
|
|||
|
"plt.ylabel(\"Главная компонента 2\")\n",
|
|||
|
"plt.colorbar(label='Кластер')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Оценим коэффициенты силуэтов двух методов кластеризации"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Коэффициент силуэта для K-means: 0.0405\n",
|
|||
|
"Коэффициент силуэта для иерархической кластеризации: 0.3230\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Оценка качества\n",
|
|||
|
"silhouette_kmeans = silhouette_score(df_scaled, df['Cluster'])\n",
|
|||
|
"silhouette_hierarchical = silhouette_score(df_scaled, df['Hierarchical Cluster'])\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Коэффициент силуэта для K-means: {silhouette_kmeans:.4f}\")\n",
|
|||
|
"print(f\"Коэффициент силуэта для иерархической кластеризации: {silhouette_hierarchical:.4f}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Из полученных данных видно, что коэффициент силуэта для иерархической кластеризации лучше: он ближе к 1. А вот результат для K-means значительно хуже: границы кластеров размыты сильнее, чем в иерархическом методе. В ходе экспериментов с количеством кластеров для K-means выяснилось, что 0.0405 — лучшее достигнутое значение коэффициента силуэта"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|