AIM-PIbd-32-Isaeva-A-I/lab_5/Lab5.ipynb

337 lines
248 KiB
Plaintext
Raw Normal View History

2024-12-21 01:02:44 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Лабораторная 5"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n",
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
" 'tradebalance', 'USTreasury'],\n",
" dtype='object')\n",
" stock index country year index price log_indexprice \\\n",
"0 NASDAQ United States of America 1980.0 168.61 2.23 \n",
"1 NASDAQ United States of America 1981.0 203.15 2.31 \n",
"2 NASDAQ United States of America 1982.0 188.98 2.28 \n",
"3 NASDAQ United States of America 1983.0 285.43 2.46 \n",
"4 NASDAQ United States of America 1984.0 248.89 2.40 \n",
"\n",
" inflationrate oil prices exchange_rate gdppercent percapitaincome \\\n",
"0 0.14 21.59 1.0 0.09 12575.0 \n",
"1 0.10 31.77 1.0 0.12 13976.0 \n",
"2 0.06 28.52 1.0 0.04 14434.0 \n",
"3 0.03 26.19 1.0 0.09 15544.0 \n",
"4 0.04 25.88 1.0 0.11 17121.0 \n",
"\n",
" unemploymentrate manufacturingoutput tradebalance USTreasury \n",
"0 0.07 NaN -13.06 0.11 \n",
"1 0.08 NaN -12.52 0.14 \n",
"2 0.10 NaN -19.97 0.13 \n",
"3 0.10 NaN -51.64 0.11 \n",
"4 0.08 NaN -102.73 0.12 \n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.cluster import KMeans\n",
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"# Load the dataset and preview its column names and first rows.\n",
"df = pd.read_csv(\".//csv//EconomicData.csv\")\n",
"for preview in (df.columns, df.head()):\n",
"    print(preview)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Бизнес-цель: сегментировать страны на основе экономических показателей для определения схожих групп стран и последующего анализа каждой группы."
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Данные содержат текстовые значения.\n",
"Исходный размер датасета: 369\n",
"Очищенный размер датасета: 219\n"
]
}
],
"source": [
"# Drop every row that has at least one missing value.\n",
"df_clean = df.dropna()\n",
"\n",
"# Detect non-numeric columns by dtype instead of probing a single column by position.\n",
"text_columns = df_clean.select_dtypes(exclude=[np.number]).columns\n",
"if len(text_columns) > 0:\n",
"    print(\"Данные содержат текстовые значения.\")\n",
"\n",
"print(f\"Исходный размер датасета: {df.shape[0]}\")\n",
"print(f\"Очищенный размер датасета: {df_clean.shape[0]}\")\n",
"\n",
"# One-hot encode 'country' (first level dropped). This rebinds `df`, so run the cell\n",
"# once after the loading cell: on a second run `df` no longer has a 'country' column.\n",
"df = pd.get_dummies(df_clean, columns=['country'], drop_first=True)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# Выбор признаков для кластеризации\n",
"features = ['index price', 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent', \n",
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput', 'tradebalance', 'USTreasury']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Стандартизируем признаки, чтобы устранить влияние масштаба.\n",
"Затем понизим размерность до двух главных компонент с помощью PCA — это нужно только для визуализации; сама кластеризация выполняется на стандартизированных признаках."
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Preprocessing: standardize the selected feature columns.\n",
"feature_matrix = df[features]\n",
"scaler = StandardScaler().fit(feature_matrix)\n",
"scaled_data = scaler.transform(feature_matrix)\n",
"\n",
"# Reduce the standardized features to 2 principal components.\n",
"pca = PCA(n_components=2)\n",
"pca_data = pca.fit_transform(scaled_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Кластеризуем стандартизированные данные методом K-Means (3 кластера). Для визуализации кластеров используем проекцию данных на две главные компоненты (PCA)."
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0wAAAIjCAYAAAAwSJuMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gU1dcH8O/MbE1PIAVI6L33ItK79I4VsCEgiKgv2MWGIgg/EKzYAZGOKL0jIr0rHRJKSAjp2TI7M+8fkUhMNtkN2dTv53l4lN1zZ84GSPbsvfdcQdM0DURERERERJSFWNgJEBERERERFVUsmIiIiIiIiJxgwUREREREROQECyYiIiIiIiInWDARERERERE5wYKJiIiIiIjICRZMRERERERETrBgIiIiIiIicoIFExERERERkRMsmIiIiP5j1KhRqFy5cmGnkSeXL1+GIAj49ttvCzsVIqISgQUTEVEx9u2330IQBBw8eDDT44mJiWjZsiVMJhM2bNiQ41hBELBnz54sz2uahoiICAiCgD59+ngk/4KWlJSEadOmoVGjRvDx8YHZbEb9+vUxZcoUXL9+vcDyWLBgAQsaIqJiQlfYCRARUf5KSkpC9+7dcfz4caxatQo9e/bMMd5kMmHx4sW4//77Mz2+c+dOXL16FUaj0ZPpFpiLFy+ia9euiIyMxNChQ/H000/DYDDg+PHjWLhwIVatWoWzZ88WSC4LFixA2bJlMWrUqHy/dqVKlWCxWKDX6/P92kREpRELJiKiEiQ5ORk9evTA0aNHsXLlSvTq1SvXMQ888ACWLVuGuXPnQqf798fC4sWL0axZM9y6dcuTKRcIh8OBQYMG4ebNm9ixY0eW4vC9997Dhx9+WEjZ5Q+HwwFVVWEwGGAymQo7HSKiEoNL8oiISoiUlBT07NkThw8fxooVK9C7d2+Xxj344IOIi4vD5s2bMx6z2+1Yvnw5HnrooWzHqKqKOXPmoF69ejCZTAgNDcWYMWMQHx+fKW7NmjXo3bs3ypcvD6PRiGrVquGdd96BoiiZ4jp27Ij69evj9OnT6NSpE7y8vFChQgXMmDEjy73nzZuHevXqwcvLC4GBgWjevDkWL16c42tcsWIFjh07hldffTVLsQQAfn5+eO+995yO37FjBwRBwI4dOzI9nt1+oejoaIwePRrh4eEwGo0oV64c+vfvj8uXLwMAKleujFOnTmHnzp0ZSyI7duyYMT4hIQGTJk1CREQEjEYjqlevjg8//BCqqma578yZMzFnzhxUq1YNRqMRp0+fzjanUaNGwcfHB9euXcOAAQPg4+OD4OBgvPjii1n+LOLi4vDoo4/Cz88PAQEBGDlyJI4dO8Z9UURUanGGiYioBEhNTUWvXr1w4MABLF++3K09R5UrV0abNm2wZMmSjBmp9evXIzExESNGjMDcuXOzjBkzZgy+/fZbjB49GhMnTsSlS5fwySef4MiRI/j9998zloN9++238PHxweTJk+Hj44Nt27bhjTfeQFJSEj766KNM14yPj0fPnj0xaNAgDBs2DMuXL8eUKVPQoEGDjLy+/PJLTJw4EUOGDMFzzz0Hq9WK48eP488//3Ra3AHA2rVrAQCPPvqoy1+XvBo8eDBOnTqFCRMmoHLlyoiJicHmzZsRGRmJypUrY86cOZgwYQJ8fHzw6quvAgBCQ0MBAGlpaejQoQOuXbuGMWPGoGLFiti7dy9efvll3LhxA3PmzMl0r2+++QZWqxVPP/00jEYjgoKCMhVWd1MUBT169ECrVq0wc+ZMbNmyBbNmzUK1atUwduxYAOmFcN++fbF//36MHTsWtWvXxpo1azBy5EjPfcGIiIo6jYiIiq1vvvlGA6BVqlRJ0+v12urVq90ee+DAAe2TTz7RfH19tbS0NE3TNG3o0KFap06dNE3TtEqVKmm9e/fOGLd7924NgLZo0aJM19uwYUOWx+9c725jxozRvLy8NKvVmvFYhw4dNA
Da999/n/GYzWbTwsLCtMGDB2c81r9/f61evXouv8Y7mjRpovn7+7scP3LkSK1SpUoZv9++fbsGQNu+fXumuEuXLmkAtG+++UbTNE2Lj4/XAGgfffRRjtevV6+e1qFDhyyPv/POO5q3t7d29uzZTI9PnTpVkyRJi4yMzHRfPz8/LSYmJsec7rweANrbb7+dKbZJkyZas2bNMn6/YsUKDYA2Z86cjMcURdE6d+6c5ZpERKUFl+QREZUAN2/ehMlkQkRERJ7GDxs2DBaLBevWrUNycjLWrVvndMZm2bJl8Pf3R7du3XDr1q2MX82aNYOPjw+2b9+eEWs2mzP+Pzk5Gbdu3UK7du2QlpaGv//+O9N1fXx88Mgjj2T83mAwoGXLlrh48WLGYwEBAbh69SoOHDjg1utLSkqCr6+vW2Pywmw2w2AwYMeOHVmWJ7pi2bJlaNeuHQIDAzN9bbt27QpFUbBr165M8YMHD0ZwcLDL13/mmWcy/b5du3aZvr4bNmyAXq/HU089lfGYKIoYP36826+FiKikYMFERFQCfP755zAYDOjZsyfOnDmT8biiKIiOjs70y263ZxkfHByMrl27YvHixVi5ciUURcGQIUOyvde5c+eQmJiIkJAQBAcHZ/qVkpKCmJiYjNhTp05h4MCB8Pf3h5+fH4KDgzOKosTExEzXDQ8PhyAImR4LDAzMVHhMmTIFPj4+aNmyJWrUqIHx48fj999/z/Xr4+fnh+Tk5Fzj7pXRaMSHH36I9evXIzQ0FO3bt8eMGTMQHR3t0vhz585hw4YNWb6uXbt2BYBMX1sAqFKlisu5mUymLMXVf7++V65cQbly5eDl5ZUprnr16i7fh4iopOEeJiKiEqBu3br47bff0KVLF3Tr1g2///47IiIiEBUVleVN9fbt2zM1GbjjoYcewlNPPYXo6Gj06tULAQEB2d5LVVWEhIRg0aJF2T5/5015QkICOnToAD8/P7z99tuoVq0aTCYTDh8+jClTpmTZayNJUrbX0zQt4//r1KmDM2fOYN26ddiwYQNWrFiBBQsW4I033sC0adOcfXlQu3ZtHDlyBFFRUXmahftvIXfHfxsmAMCkSZPQt29frF69Ghs3bsTrr7+O6dOnY9u2bWjSpEmO91FVFd26dcP//d//Zft8zZo1M/3+7hm83Dj7+hIRUc5YMBERlRAtW7bE6tWr0bt3b3Tr1g27d+9GWFhYpu53ANCoUaNsxw8cOBBjxozBvn37sHTpUqf3qVatGrZs2YK2bdvm+IZ9x44diIuLw8qVK9G+ffuMxy9duuTmK8vM29sbw4cPx/Dhw2G32zFo0CC89957ePnll5220+7bty+WLFmCH3/8ES+//LLb9wwMDASQXgTe7cqVK9nGV6tWDS+88AJeeOEFnDt3Do0bN8asWbPw448/AnBegFWrVg0pKSkZM0oFrVKlSti+fTvS0tIyzTKdP3++UPIhIioKuCSPiKgE6dKlC5YsWYLz58+jZ8+esNvt6Nq1a6Zfd978/5ePjw8+/fRTvPXWW+jbt6/TewwbNgyKouCdd97J8pzD4cgoKu7MaNw9Q2S327FgwYI8v764uLhMvzcYDKhbty40TYMsy07HDRkyBA0aNMB7772HP/74I8vzycnJGR3rslOpUiVIkpRlD9F/X0taWhqsVmumx6pVqwZfX1/YbLaMx7y9vbMUX0D61/aPP/7Axo0bszyXkJAAh8PhNMf80KNHD8iyjC+//DLjMVVVMX/+fI/el4ioKOMMExFRCTNw4EB8+eWXePzxx9GvXz9s2LDB5YNMXWkf3aFDB4wZMwbTp0/H0aNH0b17d+j1epw7dw7Lli3D//73PwwZMgT33XcfAgMDMXLkSEycOBGCIOCHH37IVEC5q3v37ggLC0Pbtm0RGhqKv/76C5988gl69+6dY1MHvV6PlStXomvXrmjfvj2GDRuGtm3bQq/X49SpU1i8eDECAwOdnsXk7++PoUOHYt68eRAEAd
WqVcO6deuy7Ck6e/YsunTpgmHDhqFu3brQ6XRYtWoVbt68iREjRmTENWvWDJ9++ineffddVK9eHSEhIejcuTNeeuk
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Non-hierarchical clustering: K-Means on the standardized features.\n",
"kmeans = KMeans(n_clusters=3, random_state=42)\n",
"kmeans_labels = kmeans.fit_predict(scaled_data)\n",
"\n",
"# Show the K-Means clusters in the 2-D PCA projection.\n",
"plt.figure(figsize=(10, 6))\n",
"sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=kmeans_labels, palette='inferno', s=100)\n",
"plt.title('K-Means Clustering')\n",
"# The plotted axes are the two principal components, so label them as such.\n",
"plt.xlabel('PC1')\n",
"plt.ylabel('PC2')\n",
"plt.show()\n",
"\n",
"# Silhouette score of the K-Means labels on the standardized features.\n",
"silhouette_avg_kmeans = silhouette_score(scaled_data, kmeans_labels)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Иерархическая кластеризация (linkage) строит дерево объединений, а не метки кластеров. Чтобы получить метки, нужно задать количество кластеров и «разрезать» дерево на это число групп (fcluster)."
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAzcAAAJcCAYAAADElVr3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACFTklEQVR4nO3dd3gU5cLG4WdTSUIvoQakVwFBpCqoIFUBBUS6SBFBKSLCEQsKAsoRFAUEFEHAQvUcUBREUBSQIgIWmigBBKSXdDLfH/l2zm6ySXaTTbKZ/O7r4mI3Mzvz7uzszDz7lrEZhmEIAAAAAHI5v5wuAAAAAAB4A+EGAAAAgCUQbgAAAABYAuEGAAAAgCUQbgAAAABYAuEGAAAAgCUQbgAAAABYAuEGAAAAgCUQbgAAAABYAuEGAAAAgCUQbgDAR33wwQey2WzavXt3imkDBgyQzWZTnTp1cqBkAAD4JsINAOQyR48e1dKlS3O6GAAA+JyAnC4AAMAzU6ZMUWBgoKpUqZLTRQEAwKdQcwMAucixY8e0dOlSDR06VKVKlUoxfenSpWrYsKFCQkJUtGhR9ezZU5GRkU7ztGrVSnXq1NGePXvUrFkzhYSEqGLFipo3b57TfHFxcXrhhRfUsGFDFSpUSGFhYbrzzjv1zTffOM33559/ymazacaMGZo5c6YqVKigkJAQtWzZUgcPHjTnO3funEqUKKFWrVrJMAzz70ePHlVYWJgefvhhpzK2atXKaT27du2SzWaTzWYz/7ZlyxbZbDZt2bLFad4BAwbolltucfpbYmKiZs2apdq1aytfvnwqWbKkhg4dqkuXLqXYjl988YVatmypAgUKqGDBgmrUqJGWL1+eZvmmTJkiPz8/p/m+++47de/eXeXLl1dwcLAiIiI0evRoRUdHp1jnypUrdfvtt6tAgQLm+7RvVwCAewg3AJCLTJ48WQEBAXr22WdTTJsyZYr69eunqlWr6o033tCoUaP09ddf66677tLly5ed5r106ZI6dOighg0b6rXXXlO5cuU0bNgwvf/+++Y8V69e1cKFC9WqVStNnz5dL730kv755x+1bdtW+/btS7H+JUuW6K233tLw4cM1YcIEHTx4UPfcc4/Onj0rSQoPD9fcuXO1detWzZ49W1JS4BgwYIAKFCigOXPmpPneXb1nTwwdOlTPPPOMmjdvrjfffFOPPvqoli1bprZt2yo+Pt6c74MPPlDHjh118eJFTZgwQdOmTVP9+vW1YcOGVJe9aNEiTZw4UTNmzFCvXr3Mv69YsUJRUVEaNmyYZs+erbZt22r27Nnq16+f0+u3b9+uHj166ObNm5o2bZo+/PBDzZw5M1PvFwDyJAMA4JMWLVpkSDJ27dplGIZhHDt2zAgICDCeeuopwzAMo2XLlkbt2rUNwzCMP//80/D39zemTJnitIwDBw4YAQEBTn9v2bKlIcn497//bf4tNjbWqF+/vhEeHm7ExcUZhmEYCQkJRmxsrNPyLl26ZJQsWdIYOHCg+bfjx48bkoyQkBDj5MmT5t937txpSDJGjx7ttIxHHnnECA0NNQ4fPmy8/vrrhiRj7dq1TvO0bNnSaNmypfn8888/NyQZ7dq1MxxPXVu3bjUkGZs3b3Z6ff/+/Y0KFSqYz7/77jtDkrFs2TKn+TZs2OD098uXLxsFChQwGjdubERHRzvNm5iY6LJ869evNwICAoynn37aSC4qKirF36ZOnWrYbDbjr7/+Mv82YcIEQ5Lx999/m3+zb9fXX389xTIAAK5RcwMAuYS91mb8+PEppq1evVqJiYnq0aOHzp8/b/4rVaqUqlatmqIpWUBAgIYOHWo+DwoK0tChQ3Xu3Dnt2bNHkuTv76+goCBJSTUsFy9eVEJCgm6//Xbt3bs3RRm6dOmismXLms/vuOMONW7cWJ9//rnTfG+//bYKFSqkbt266fnnn1ffvn3VuXPnVN+3YRiaMGGCHnroITVu3NhpWn
h4uCTp5MmTqb5eSqpBKVSokNq0aeO0fRo2bKj8+fOb22fjxo26du2axo8fr3z58jktw7E5nN2PP/6oHj166KGHHtLrr7+eYnpISIj5+MaNGzp//ryaNWsmwzD0008/mdOuXbsmPz8/FS5cOM33AQBIG+EGAHKBP/74Qx9++KGGDBmi0qVLp5h+5MgRGYahqlWrqkSJEk7/fvvtN507d85p/jJlyigsLMzpb9WqVZOU1IfGbvHixapbt67y5cunYsWKqUSJElq/fr2uXLmSogxVq1ZN8bdq1ao5LU+SihYtqrfeekv79+9XoUKF9NZbb6X53pctW6ZffvlFr776aopplSpVUqlSpTRjxgzt37/fDC2xsbEpts+VK1cUHh6eYvtcv37d3D7Hjh2TJLeG2D516pQ6duyoGzdu6MKFCy7Dz4kTJzRgwAAVLVpU+fPnV4kSJdSyZUtJctqGTZs2VWJiokaOHKljx47p/PnzLvsCAQDSxmhpAJALTJkyJdW+NlJSzYrNZtMXX3whf3//FNPz58/v8TqXLl2qAQMGqEuXLnrmmWcUHh4uf39/TZ061QwBGfXll19KSur7c/LkyVRrLOLi4vT888/rscceM8OXo6CgIC1YsEC9evVSvXr1nKZVqFDBfJyYmKjw8HAtW7bM5XpKlCjh8Xs4evSoGjRooJkzZ6pv375avHix+vfvb06/efOm2rRpo4sXL+rZZ59VjRo1FBYWplOnTmnAgAFKTEw05+3Zs6f27t2r2bNna/78+R6XBQCQhHADAD7u+PHjWrJkiYYNG6YyZcq4nKdy5coyDEMVK1Z0GQKSO336tG7cuOFUe3P48GFJMkcZW7lypSpVqqTVq1c71Uq8+OKLLpd55MiRFH87fPhwilHLNmzYoIULF2rcuHFatmyZ+vfvr507dyogIOUpac6cOTp37pxeeumlVN9Lp06ddOrUKe3fv98chez111/XoUOHzHkqV66sTZs2qXnz5k5NxZKrXLmyJOngwYPpDrVdunRpff755ypZsqQ+++wzPf300+rQoYMZlA4cOKDDhw9r8eLFTgMIbNy4McWy/Pz8NGPGDB04cEDHjx/XnDlzdPbsWfXp0yfNMgAAnNEsDQB83Kuvvip/f3+XfW3sHnzwQfn7+2vSpElOwyxLSX1WLly44PS3hIQEvfvuu+bzuLg4vfvuuypRooQaNmwoSWYNkOPydu7cqe3bt7ssw9q1a3Xq1Cnz+Y8//qidO3eqffv25t8uX76sQYMG6Y477tCrr76qhQsXau/evS6bnF27dk1TpkzR6NGjXQ577ahAgQJq3ry5WrdurdatW6doumcfieyVV15J8dqEhARzNLn77rtPBQoU0NSpUxUTE+M0X/LtWq1aNZUsWVKSNHv2bLNZmZ2r7WcYht58802X72H27NnavHmzli1bptatW6t58+ZpvmcAQErU3ACAj9u3b59GjBiRaq2NlFTjMHnyZE2YMEF//vmnunTpogIFCuj48eNas2aNhgwZorFjx5rzlylTRtOnT9eff/6patWq6ZNPPtG+ffs0f/58BQYGSkqqEVm9erW6du2qjh076vjx45o3b55q1aql69evpyhDlSpV1KJFCw0bNkyxsbGaNWuWihUrpnHjxpnzjBw5UhcuXNCmTZvk7++vdu3aadCgQZo8ebI6d+7s1LRs7969Kl68uNPrM6ply5YaOnSopk6dqn379um+++5TYGCgjhw5ohUrVujNN99Ut27dVLBgQc2cOVODBg1So0aN1KtXLxUpUkQ///yzoqKitHjxYpfLL1WqlF5//XUNGjRIffr0UYcOHVSjRg1VrlxZY8eO1alTp1SwYEGtWrXKZV+aX375RePGjdNLL72kRo0aZfr9AkCelVPDtAEA0mYfCjo4ONhpiGU7x6Gg7VatWmW0aNHCCAsLM8LCwowaNWoYw4cPNw4dOpTidbt37zaaNm1q5MuXz6hQoYLx9ttvOy0rMTHRePXVV4
0KFSoYwcHBxm233WasW7cuxTDLjkMW//vf/zYiIiKM4OBg48477zR+/vlnc77PPvssxRDUhmEYV69eNSpUqGDUq1f
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Hierarchical clustering: build the Ward linkage on the standardized features.\n",
"linked = linkage(scaled_data, method='ward')\n",
"\n",
"# Draw the dendrogram of the resulting merge tree.\n",
"plt.figure(figsize=(10, 7))\n",
"dendrogram_options = {'orientation': 'top', 'distance_sort': 'descending', 'show_leaf_counts': True}\n",
"dendrogram(linked, **dendrogram_options)\n",
"plt.title('Иерархическая')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"# Cut the linkage tree into flat clusters (criterion 'maxclust', t = n_clusters) to get labels.\n",
"n_clusters = 3\n",
"hierarchical_labels = fcluster(linked, t=n_clusters, criterion='maxclust')"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0wAAAIjCAYAAAAwSJuMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3xUVfrH8c+901JIoYXeiwiIUhSwgYooVhR11bWvZRUrtnV3f5Z1Fdfee1kLYkfRVbCLvYBYUGkC0jtJSJl2z++POwmEtJmQSf2+Xy9eu5l7zr1nJjG5z33OeY5ljDGIiIiIiIhIOXZ9D0BERERERKShUsAkIiIiIiJSCQVMIiIiIiIilVDAJCIiIiIiUgkFTCIiIiIiIpVQwCQiIiIiIlIJBUwiIiIiIiKVUMAkIiIiIiJSCQVMIiIiIiIilVDAJCIiIiIiUgkFTCIizcx///tfLMviu+++K3fsjDPOwLIsBg4cWA8jExERaXgUMImICACLFi3iueeeq+9hiIiINCje+h6AiIg0DDfddBM+n4/evXvX91BEREQaDGWYRESExYsX89xzz3HeeefRvn37csefe+45hg4dSmpqKq1ateLEE09k+fLlZdqMHj2agQMHMnv2bPbee29SU1Pp0aMHDz/8cJl2oVCIa6+9lqFDh5KVlUV6ejr77bcfH330UZl2S5cuxbIsbr/9du666y66detGamoqo0aN4ueffy5tt27dOtq2bcvo0aMxxpS+vmjRItLT0/nTn/5UZoyjR48uc51vv/0Wy7KwLKv0tY8//hjLsvj444/LtD3jjDPo3r17mdccx+Huu+9mwIABpKSk0K5dO8477zw2b95c7nN85513GDVqFBkZGWRmZrLnnnvy/PPPVzm+m266Cdu2y7T79NNPOf744+natSuBQIAuXbpw2WWXUVRUVO6ar7zyCsOGDSMjI6P0fZZ8riIiUj0FTCIiwr///W+8Xi9XX311uWM33XQTp512Gn369OHOO+/k0ksv5YMPPmD//fdny5YtZdpu3ryZww47jKFDh3LrrbfSuXNnzj//fJ588snSNnl5eTz++OOMHj2a//znP1x//fWsX7+eQw45hLlz55a7/jPPPMO9997LxIkTueaaa/j555858MADWbt2LQA5OTk89NBDfPLJJ9x3332AG8ScccYZZGRk8OCDD1b53it6z4k477zzuPLKK9lnn3245557OPPMM5kyZQqHHHII4XC4tN1///tfDj/8cDZt2sQ111zDLbfcwh577MGMGTMqPfdTTz3FP//5T26//XZOPvnk0tdffvllCgsLOf/887nvvvs45JBDuO+++zjttNPK9P/yyy854YQTiEaj3HLLLTz77LPcddddO/V+RUSaHSMiIs3KU089ZQDz7bffGmOMWbx4sfF6vebiiy82xhgzatQoM2DAAGOMMUuXLjUej8fcdNNNZc7x008/Ga/XW+b1UaNGGcDccccdpa8Fg0Gzxx57mJycHBMKhYwxxkQiERMMBsucb/PmzaZdu3bmrLPOKn1tyZIlBjCpqalmxYoVpa9//fXXBjCXXXZZmXOcdNJJJi0tzSxYsMDcdtttBjCvv/56mTajRo0yo0aNKv367bffNoA59NBDzfZ/Ej/55BMDmA8//LBM/9NPP91069at9OtPP/3UAGbKlCll2s2YMaPM61u2bDEZGRlm+PDhpqioqExbx3EqHN///vc/4/V6zeWXX252VFhYWO61yZMnG8uyzLJly0pfu+aaawxgVq9eXfpayed62223lTuHiIiUpwyTiEgzV5Jd+tvf/lbu2GuvvYbjOJxwwgls2LCh9F/79u3p06dPuWl0Xq+X8847r/Rrv9/Peeedx7p165g9ezYAHo8Hv98PuJmgTZs2EYlEGDZsGHPmzCk3hvHjx9OpU6fSr/faay+GDx/O22+/Xabd/fffT1ZWFscddxz/93//x6mnnsrRRx9d6fs2xn
DNNdcwYcIEhg8fXuZYTk4OACtWrKi0P7iZnqysLA4++OAyn8/QoUNp0aJF6efz3nvvkZ+fz9/+9jdSUlLKnGP7qYAlvvnmG0444QQmTJjAbbfdVu54ampq6f8vKChgw4YN7L333hhj+P7770uP5efnY9s22dnZVb4PERGpnAImEZFm7Pfff+fZZ5/l3HPPpUOHDuWOL1y4EGMMffr0oW3btmX+/frrr6xbt65M+44dO5Kenl7mtb59+wLumqQSTz/9NIMGDSIlJYXWrVvTtm1b/ve//5Gbm1tuDH369Cn3Wt++fcucD6BVq1bce++9/Pjjj2RlZXHvvfdW+d6nTJnCvHnzuPnmm8sd69mzJ+3bt+f222/nxx9/LA2EgsFguc8nNzeXnJyccp/P1q1bSz+fxYsXA8RVrn3lypUcfvjhFBQUsHHjxgoDqj/++IMzzjiDVq1a0aJFC9q2bcuoUaMAynyGI0eOxHEcLrnkEhYvXsyGDRsqXFslIiKVU5U8EZFm7Kabbqp07RK4GSDLsnjnnXfweDzljrdo0SLhaz733HOcccYZjB8/niuvvJKcnBw8Hg+TJ08uDSxqaubMmYC7lmrFihWVZlZCoRD/93//x1/+8pfSgG57fr+fxx57jJNPPpndd9+9zLFu3bqV/n/HccjJyWHKlCkVXqdt27YJv4dFixYxZMgQ7rrrLk499VSefvppTj/99NLj0WiUgw8+mE2bNnH11VfTr18/0tPTWblyJWeccQaO45S2PfHEE5kzZw733Xcfjz76aMJjERERBUwiIs3WkiVLeOaZZzj//PPp2LFjhW169eqFMYYePXpUGFjsaNWqVRQUFJTJMi1YsACgtLrcK6+8Qs+ePXnttdfKZE+uu+66Cs+5cOHCcq8tWLCgXLW6GTNm8Pjjj3PVVVcxZcoUTj/9dL7++mu83vJ/6h588EHWrVvH9ddfX+l7OeKII1i5ciU//vhjafW52267jfnz55e26dWrF++//z777LNPmWlyO+rVqxcAP//8c7Vl2zt06MDbb79Nu3bteOONN7j88ss57LDDSoOvn376iQULFvD000+XKfLw3nvvlTuXbdvcfvvt/PTTTyxZsoQHH3yQtWvXcsopp1Q5BhER2UZT8kREmqmbb74Zj8dT4dqlEsceeywej4cbbrihTMlucNcAbdy4scxrkUiERx55pPTrUCjEI488Qtu2bRk6dChAaaZq+/N9/fXXfPnllxWO4fXXX2flypWlX3/zzTd8/fXXjBs3rvS1LVu2cPbZZ7PXXntx88038/jjjzNnzpwKp9vl5+dz0003cdlll1VYQn17GRkZ7LPPPowZM4YxY8aUm7ZYUoHuxhtvLNc3EomUVhEcO3YsGRkZTJ48meLi4jLtdvxc+/btS7t27QC47777SqfUlajo8zPGcM8991T4Hu677z4+/PBDpkyZwpgxY9hnn32qfM8iIlKWMkwiIs3U3LlzufDCCyvNLoGbGfn3v//NNddcw9KlSxk/fjwZGRksWbKEadOmce6553LFFVeUtu/YsSP/+c9/WLp0KX379uXFF19k7ty5PProo/h8PsDN3Lz22mscc8wxHH744SxZsoSHH36Y/v37s3Xr1nJj6N27N/vuuy/nn38+wWCQu+++m9atW3PVVVeVtrnkkkvYuHEj77//Ph6Ph0MPPZSzzz6bf//73xx99NFlptXNmTOHNm3alOlfU6NGjeK8885j8uTJzJ07l7Fjx+Lz+Vi4cCEvv/wy99xzD8cddxyZmZncddddnH322ey5556cfPLJtGzZkh9++IHCwkKefvrpCs/fvn17brvtNs4++2xOOeUUDjvsMPr160evXr244oorWLlyJZmZmbz66qsVrk2aN28eV111Fddffz177rnnTr9fEZFmqb7K84mISP0oKSseCATKlOsusX1Z8RKvvvqq2XfffU16erpJT083/fr1MxMnTjTz588v1++7774zI0eONCkpKaZbt2
7m/vvvL3Mux3HMzTffbLp162YCgYAZPHiweeutt8qV7N6+/PUdd9xhunTpYgKBgNlvv/3MDz/8UNrujTfeKFfO3Bh
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Show the hierarchical clusters in the 2-D PCA projection.\n",
"plt.figure(figsize=(10, 6))\n",
"ax = sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=hierarchical_labels, palette='inferno', s=100)\n",
"ax.set(title='Иерархическая', xlabel='PC1', ylabel='PC2')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" year index price log_indexprice inflationrate oil prices \\\n",
"Cluster \n",
"0 1998.054545 13563.522364 3.929091 0.054182 34.765091 \n",
"1 2005.619048 7237.508776 3.713401 0.020680 48.031361 \n",
"2 2009.294118 3554.822941 3.494118 0.022941 59.845294 \n",
"\n",
" exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
"Cluster \n",
"0 85.857273 0.045818 7502.927273 0.061818 \n",
"1 6.610340 0.029320 27037.510204 0.077823 \n",
"2 1.000000 0.025294 49157.352941 0.058235 \n",
"\n",
" manufacturingoutput tradebalance USTreasury \n",
"Cluster \n",
"0 132.100000 -6.739455 0.063636 \n",
"1 473.491633 34.495510 0.042993 \n",
"2 251.887059 -555.851765 0.035294 \n"
]
}
],
"source": [
"# Attach the K-Means labels to the data frame as the 'Cluster' column.\n",
"df = df.assign(Cluster=kmeans_labels)\n",
"\n",
"# Average only the float64/int64 columns within each cluster.\n",
"numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
"per_cluster = df.groupby('Cluster')[numeric_columns]\n",
"cluster_analysis = per_cluster.mean()\n",
"\n",
"# Print the per-cluster means.\n",
"print(cluster_analysis)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оценка для неиерархического: 0.20251\n",
"Оценка для иерархического: 0.20251\n"
]
}
],
"source": [
"# Evaluation: silhouette score of each clustering on the standardized features.\n",
"print(f\"Оценка для неиерархического: {round(silhouette_avg_kmeans,5)}\")\n",
"\n",
"# Score the hierarchical labels here; passing kmeans_labels would only repeat the K-Means score.\n",
"silhouette_avg = silhouette_score(scaled_data, hierarchical_labels)\n",
"print(f\"Оценка для иерархического: {round(silhouette_avg,5)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
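"Оценки в сохранённом выводе совпадают не из-за свойств данных: такое совпадение возникает, когда оба вызова silhouette_score получают одни и те же метки (kmeans_labels). Оценку иерархической кластеризации нужно считать по hierarchical_labels — тогда значения, как правило, различаются."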
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Scripts",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}