337 lines
248 KiB
Plaintext
337 lines
248 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Лабораторная 5"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 47,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['stock index', 'country', 'year', 'index price', 'log_indexprice',\n",
|
|||
|
" 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent',\n",
|
|||
|
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput',\n",
|
|||
|
" 'tradebalance', 'USTreasury'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
" stock index country year index price log_indexprice \\\n",
|
|||
|
"0 NASDAQ United States of America 1980.0 168.61 2.23 \n",
|
|||
|
"1 NASDAQ United States of America 1981.0 203.15 2.31 \n",
|
|||
|
"2 NASDAQ United States of America 1982.0 188.98 2.28 \n",
|
|||
|
"3 NASDAQ United States of America 1983.0 285.43 2.46 \n",
|
|||
|
"4 NASDAQ United States of America 1984.0 248.89 2.40 \n",
|
|||
|
"\n",
|
|||
|
" inflationrate oil prices exchange_rate gdppercent percapitaincome \\\n",
|
|||
|
"0 0.14 21.59 1.0 0.09 12575.0 \n",
|
|||
|
"1 0.10 31.77 1.0 0.12 13976.0 \n",
|
|||
|
"2 0.06 28.52 1.0 0.04 14434.0 \n",
|
|||
|
"3 0.03 26.19 1.0 0.09 15544.0 \n",
|
|||
|
"4 0.04 25.88 1.0 0.11 17121.0 \n",
|
|||
|
"\n",
|
|||
|
" unemploymentrate manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"0 0.07 NaN -13.06 0.11 \n",
|
|||
|
"1 0.08 NaN -12.52 0.14 \n",
|
|||
|
"2 0.10 NaN -19.97 0.13 \n",
|
|||
|
"3 0.10 NaN -51.64 0.11 \n",
|
|||
|
"4 0.08 NaN -102.73 0.12 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.decomposition import PCA\n",
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
|
|||
|
"from sklearn.metrics import silhouette_score\n",
|
|||
|
"\n",
|
|||
|
"# Load the economic indicators dataset (path is relative to the notebook's working directory).\n",
|
|||
|
"df = pd.read_csv(\".//csv//EconomicData.csv\")\n",
|
|||
|
"# Sanity check: show the column names and the first rows.\n",
|
|||
|
"print(df.columns)\n",
|
|||
|
"print(df.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Бизнес-цель: сегментировать страны на основе экономических показателей для определения схожих групп стран и последующего анализа каждой группы."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 48,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Данные содержат текстовые значения.\n",
|
|||
|
"Исходный размер датасета: 369\n",
|
|||
|
"Очищенный размер датасета: 219\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = df.copy()\n",
|
|||
|
"df_clean = df.dropna()\n",
|
|||
|
"\n",
|
|||
|
"if not np.issubdtype(df_clean.dtypes.iloc[1], np.number):\n",
|
|||
|
" print(\"Данные содержат текстовые значения.\")\n",
|
|||
|
" cleaned_data = df_clean.select_dtypes(include=[np.number])\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Исходный размер датасета: {df.shape[0]}\")\n",
|
|||
|
"print(f\"Очищенный размер датасета: {df_clean.shape[0]}\")\n",
|
|||
|
"\n",
|
|||
|
"df = pd.get_dummies(df_clean, columns=['country'], drop_first=True)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 49,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Feature selection for clustering ('year' and 'log_indexprice' are not included)\n",
|
|||
|
"features = ['index price', 'inflationrate', 'oil prices', 'exchange_rate', 'gdppercent', \n",
|
|||
|
" 'percapitaincome', 'unemploymentrate', 'manufacturingoutput', 'tradebalance', 'USTreasury']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Стандартизируем, чтобы устранить влияние масштаба.\n",
|
|||
|
"А также понизим размерность с помощью PCA, чтобы сократить число признаков и визуализировать данные."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 50,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Preprocessing: standardize the selected features\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"scaled_data = scaler.fit_transform(df[features])\n",
|
|||
|
"\n",
|
|||
|
"# Dimensionality reduction with PCA: keep two components (used as x/y in the scatter plots)\n",
|
|||
|
"pca = PCA(n_components=2)\n",
|
|||
|
"pca_data = pca.fit_transform(scaled_data)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Используем метод главных компонент (PCA) для уменьшения размерности данных до 2D для визуализации."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 51,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0wAAAIjCAYAAAAwSJuMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gU1dcH8O/MbE1PIAVI6L33ItK79I4VsCEgiKgv2MWGIgg/EKzYAZGOKL0jIr0rHRJKSAjp2TI7M+8fkUhMNtkN2dTv53l4lN1zZ84GSPbsvfdcQdM0DURERERERJSFWNgJEBERERERFVUsmIiIiIiIiJxgwUREREREROQECyYiIiIiIiInWDARERERERE5wYKJiIiIiIjICRZMRERERERETrBgIiIiIiIicoIFExERERERkRMsmIiIiP5j1KhRqFy5cmGnkSeXL1+GIAj49ttvCzsVIqISgQUTEVEx9u2330IQBBw8eDDT44mJiWjZsiVMJhM2bNiQ41hBELBnz54sz2uahoiICAiCgD59+ngk/4KWlJSEadOmoVGjRvDx8YHZbEb9+vUxZcoUXL9+vcDyWLBgAQsaIqJiQlfYCRARUf5KSkpC9+7dcfz4caxatQo9e/bMMd5kMmHx4sW4//77Mz2+c+dOXL16FUaj0ZPpFpiLFy+ia9euiIyMxNChQ/H000/DYDDg+PHjWLhwIVatWoWzZ88WSC4LFixA2bJlMWrUqHy/dqVKlWCxWKDX6/P92kREpRELJiKiEiQ5ORk9evTA0aNHsXLlSvTq1SvXMQ888ACWLVuGuXPnQqf798fC4sWL0axZM9y6dcuTKRcIh8OBQYMG4ebNm9ixY0eW4vC9997Dhx9+WEjZ5Q+HwwFVVWEwGGAymQo7HSKiEoNL8oiISoiUlBT07NkThw8fxooVK9C7d2+Xxj344IOIi4vD5s2bMx6z2+1Yvnw5HnrooWzHqKqKOXPmoF69ejCZTAgNDcWYMWMQHx+fKW7NmjXo3bs3ypcvD6PRiGrVquGdd96BoiiZ4jp27Ij69evj9OnT6NSpE7y8vFChQgXMmDEjy73nzZuHevXqwcvLC4GBgWjevDkWL16c42tcsWIFjh07hldffTVLsQQAfn5+eO+995yO37FjBwRBwI4dOzI9nt1+oejoaIwePRrh4eEwGo0oV64c+vfvj8uXLwMAKleujFOnTmHnzp0ZSyI7duyYMT4hIQGTJk1CREQEjEYjqlevjg8//BCqqma578yZMzFnzhxUq1YNRqMRp0+fzjanUaNGwcfHB9euXcOAAQPg4+OD4OBgvPjii1n+LOLi4vDoo4/Cz88PAQEBGDlyJI4dO8Z9UURUanGGiYioBEhNTUWvXr1w4MABLF++3K09R5UrV0abNm2wZMmSjBmp9evXIzExESNGjMDcuXOzjBkzZgy+/fZbjB49GhMnTsSlS5fwySef4MiRI/j9998zloN9++238PHxweTJk+Hj44Nt27bhjTfeQFJSEj766KNM14yPj0fPnj0xaNAgDBs2DMuXL8eUKVPQoEGDjLy+/PJLTJw4EUOGDMFzzz0Hq9WK48eP488//3Ra3AHA2rVrAQCPPvqoy1+XvBo8eDBOnTqFCRMmoHLlyoiJicHmzZsRGRmJypUrY86cOZgwYQJ8fHzw6quvAgBCQ0MBAGlpaejQoQOuXbuGMWPGoGLFiti7dy9efvll3LhxA3PmzMl0r2+++QZWqxVPP/00jEYjgoKCMhVWd1MUBT169ECrVq0wc+ZMbNmyBbNmzUK1atUwduxYAOmFcN++fbF//36MHTsWtWvXxpo1azBy5EjPfcGIiIo6jYiIiq1vvvlGA6BVqlRJ0+v12urVq90ee+DAAe2TTz7RfH19tbS0NE3TNG3o0KFap06dNE3TtEqVKmm9e/fOGLd7924NgLZo0aJM19uwYUOWx+9c725jxozRvLy8NKvVmvFYhw4dNA
Da999/n/GYzWbTwsLCtMGDB2c81r9/f61evXouv8Y7mjRpovn7+7scP3LkSK1SpUoZv9++fbsGQNu+fXumuEuXLmkAtG+++UbTNE2Lj4/XAGgfffRRjtevV6+e1qFDhyyPv/POO5q3t7d29uzZTI9PnTpVkyRJi4yMzHRfPz8/LSYmJsec7rweANrbb7+dKbZJkyZas2bNMn6/YsUKDYA2Z86cjMcURdE6d+6c5ZpERKUFl+QREZUAN2/ehMlkQkRERJ7GDxs2DBaLBevWrUNycjLWrVvndMZm2bJl8Pf3R7du3XDr1q2MX82aNYOPjw+2b9+eEWs2mzP+Pzk5Gbdu3UK7du2QlpaGv//+O9N1fXx88Mgjj2T83mAwoGXLlrh48WLGYwEBAbh69SoOHDjg1utLSkqCr6+vW2Pywmw2w2AwYMeOHVmWJ7pi2bJlaNeuHQIDAzN9bbt27QpFUbBr165M8YMHD0ZwcLDL13/mmWcy/b5du3aZvr4bNmyAXq/HU089lfGYKIoYP36826+FiKikYMFERFQCfP755zAYDOjZsyfOnDmT8biiKIiOjs70y263ZxkfHByMrl27YvHixVi5ciUURcGQIUOyvde5c+eQmJiIkJAQBAcHZ/qVkpKCmJiYjNhTp05h4MCB8Pf3h5+fH4KDgzOKosTExEzXDQ8PhyAImR4LDAzMVHhMmTIFPj4+aNmyJWrUqIHx48fj999/z/Xr4+fnh+Tk5Fzj7pXRaMSHH36I9evXIzQ0FO3bt8eMGTMQHR3t0vhz585hw4YNWb6uXbt2BYBMX1sAqFKlisu5mUymLMXVf7++V65cQbly5eDl5ZUprnr16i7fh4iopOEeJiKiEqBu3br47bff0KVLF3Tr1g2///47IiIiEBUVleVN9fbt2zM1GbjjoYcewlNPPYXo6Gj06tULAQEB2d5LVVWEhIRg0aJF2T5/5015QkICOnToAD8/P7z99tuoVq0aTCYTDh8+jClTpmTZayNJUrbX0zQt4//r1KmDM2fOYN26ddiwYQNWrFiBBQsW4I033sC0adOcfXlQu3ZtHDlyBFFRUXmahftvIXfHfxsmAMCkSZPQt29frF69Ghs3bsTrr7+O6dOnY9u2bWjSpEmO91FVFd26dcP//d//Zft8zZo1M/3+7hm83Dj7+hIRUc5YMBERlRAtW7bE6tWr0bt3b3Tr1g27d+9GWFhYpu53ANCoUaNsxw8cOBBjxozBvn37sHTpUqf3qVatGrZs2YK2bdvm+IZ9x44diIuLw8qVK9G+ffuMxy9duuTmK8vM29sbw4cPx/Dhw2G32zFo0CC89957ePnll5220+7bty+WLFmCH3/8ES+//LLb9wwMDASQXgTe7cqVK9nGV6tWDS+88AJeeOEFnDt3Do0bN8asWbPw448/AnBegFWrVg0pKSkZM0oFrVKlSti+fTvS0tIyzTKdP3++UPIhIioKuCSPiKgE6dKlC5YsWYLz58+jZ8+esNvt6Nq1a6Zfd978/5ePjw8+/fRTvPXWW+jbt6/TewwbNgyKouCdd97J8pzD4cgoKu7MaNw9Q2S327FgwYI8v764uLhMvzcYDKhbty40TYMsy07HDRkyBA0aNMB7772HP/74I8vzycnJGR3rslOpUiVIkpRlD9F/X0taWhqsVmumx6pVqwZfX1/YbLaMx7y9vbMUX0D61/aPP/7Axo0bszyXkJAAh8PhNMf80KNHD8iyjC+//DLjMVVVMX/+fI/el4ioKOMMExFRCTNw4EB8+eWXePzxx9GvXz9s2LDB5YNMXWkf3aFDB4wZMwbTp0/H0aNH0b17d+j1epw7dw7Lli3D//73PwwZMgT33XcfAgMDMXLkSEycOBGCIOCHH37IVEC5q3v37ggLC0Pbtm0RGhqKv/76C5988gl69+6dY1MHvV6PlStXomvXrmjfvj2GDRuGtm3bQq/X49SpU1i8eDECAwOdnsXk7++PoUOHYt68eRAEAd
WqVcO6deuy7Ck6e/YsunTpgmHDhqFu3brQ6XRYtWoVbt68iREjRmTENWvWDJ9++ineffddVK9eHSEhIejcuTNeeuk
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Неиерархическая\n",
|
|||
|
"kmeans = KMeans(n_clusters=3, random_state=42)\n",
|
|||
|
"kmeans_labels = kmeans.fit_predict(scaled_data)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация кластеров K-Means\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=kmeans_labels, palette='inferno', s=100)\n",
|
|||
|
"plt.title('K-Means Clustering')\n",
|
|||
|
"plt.xlabel('Количество кластеров')\n",
|
|||
|
"plt.ylabel('Инерция')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"#оценка неиерархического\n",
|
|||
|
"silhouette_avg_kmeans = silhouette_score(scaled_data, kmeans_labels)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Иерархическая кластеризация (linkage) строит дерево объединений и сама по себе не возвращает метки кластеров, поэтому число кластеров нужно задать отдельно и получить метки, разрезав дерево с помощью fcluster."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 52,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAzcAAAJcCAYAAADElVr3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACFTklEQVR4nO3dd3gU5cLG4WdTSUIvoQakVwFBpCqoIFUBBUS6SBFBKSLCEQsKAsoRFAUEFEHAQvUcUBREUBSQIgIWmigBBKSXdDLfH/l2zm6ySXaTTbKZ/O7r4mI3Mzvz7uzszDz7lrEZhmEIAAAAAHI5v5wuAAAAAAB4A+EGAAAAgCUQbgAAAABYAuEGAAAAgCUQbgAAAABYAuEGAAAAgCUQbgAAAABYAuEGAAAAgCUQbgAAAABYAuEGAAAAgCUQbgDAR33wwQey2WzavXt3imkDBgyQzWZTnTp1cqBkAAD4JsINAOQyR48e1dKlS3O6GAAA+JyAnC4AAMAzU6ZMUWBgoKpUqZLTRQEAwKdQcwMAucixY8e0dOlSDR06VKVKlUoxfenSpWrYsKFCQkJUtGhR9ezZU5GRkU7ztGrVSnXq1NGePXvUrFkzhYSEqGLFipo3b57TfHFxcXrhhRfUsGFDFSpUSGFhYbrzzjv1zTffOM33559/ymazacaMGZo5c6YqVKigkJAQtWzZUgcPHjTnO3funEqUKKFWrVrJMAzz70ePHlVYWJgefvhhpzK2atXKaT27du2SzWaTzWYz/7ZlyxbZbDZt2bLFad4BAwbolltucfpbYmKiZs2apdq1aytfvnwqWbKkhg4dqkuXLqXYjl988YVatmypAgUKqGDBgmrUqJGWL1+eZvmmTJkiPz8/p/m+++47de/eXeXLl1dwcLAiIiI0evRoRUdHp1jnypUrdfvtt6tAgQLm+7RvVwCAewg3AJCLTJ48WQEBAXr22WdTTJsyZYr69eunqlWr6o033tCoUaP09ddf66677tLly5ed5r106ZI6dOighg0b6rXXXlO5cuU0bNgwvf/+++Y8V69e1cKFC9WqVStNnz5dL730kv755x+1bdtW+/btS7H+JUuW6K233tLw4cM1YcIEHTx4UPfcc4/Onj0rSQoPD9fcuXO1detWzZ49W1JS4BgwYIAKFCigOXPmpPneXb1nTwwdOlTPPPOMmjdvrjfffFOPPvqoli1bprZt2yo+Pt6c74MPPlDHjh118eJFTZgwQdOmTVP9+vW1YcOGVJe9aNEiTZw4UTNmzFCvXr3Mv69YsUJRUVEaNmyYZs+erbZt22r27Nnq16+f0+u3b9+uHj166ObNm5o2bZo+/PBDzZw5M1PvFwDyJAMA4JMWLVpkSDJ27dplGIZhHDt2zAgICDCeeuopwzAMo2XLlkbt2rUNwzCMP//80/D39zemTJnitIwDBw4YAQEBTn9v2bKlIcn497//bf4tNjbWqF+/vhEeHm7ExcUZhmEYCQkJRmxsrNPyLl26ZJQsWdIYOHCg+bfjx48bkoyQkBDj5MmT5t937txpSDJGjx7ttIxHHnnECA0NNQ4fPmy8/vrrhiRj7dq1TvO0bNnSaNmypfn8888/NyQZ7dq1MxxPXVu3bjUkGZs3b3Z6ff/+/Y0KFSqYz7/77jtDkrFs2TKn+TZs2OD098uXLxsFChQwGjdubERHRzvNm5iY6LJ869evNwICAoynn37aSC4qKirF36ZOnWrYbDbjr7/+Mv82YcIEQ5Lx999/m3+zb9fXX389xTIAAK5RcwMAuYS91mb8+PEppq1evVqJiYnq0aOHzp8/b/4rVaqUqlatmqIpWUBAgIYOHWo+DwoK0tChQ3Xu3Dnt2bNHkuTv76+goCBJSTUsFy9eVEJCgm6//Xbt3bs3RRm6dOmismXLms/vuOMONW7cWJ9//rnTfG+//bYKFSqkbt266fnnn1ffvn3VuXPnVN+3YRiaMGGCHnroITVu3NhpWn
h4uCTp5MmTqb5eSqpBKVSokNq0aeO0fRo2bKj8+fOb22fjxo26du2axo8fr3z58jktw7E5nN2PP/6oHj166KGHHtLrr7+eYnpISIj5+MaNGzp//ryaNWsmwzD0008/mdOuXbsmPz8/FS5cOM33AQBIG+EGAHKBP/74Qx9++KGGDBmi0qVLp5h+5MgRGYahqlWrqkSJEk7/fvvtN507d85p/jJlyigsLMzpb9WqVZOU1IfGbvHixapbt67y5cunYsWKqUSJElq/fr2uXLmSogxVq1ZN8bdq1ao5LU+SihYtqrfeekv79+9XoUKF9NZbb6X53pctW6ZffvlFr776aopplSpVUqlSpTRjxgzt37/fDC2xsbEpts+VK1cUHh6eYvtcv37d3D7Hjh2TJLeG2D516pQ6duyoGzdu6MKFCy7Dz4kTJzRgwAAVLVpU+fPnV4kSJdSyZUtJctqGTZs2VWJiokaOHKljx47p/PnzLvsCAQDSxmhpAJALTJkyJdW+NlJSzYrNZtMXX3whf3//FNPz58/v8TqXLl2qAQMGqEuXLnrmmWcUHh4uf39/TZ061QwBGfXll19KSur7c/LkyVRrLOLi4vT888/rscceM8OXo6CgIC1YsEC9evVSvXr1nKZVqFDBfJyYmKjw8HAtW7bM5XpKlCjh8Xs4evSoGjRooJkzZ6pv375avHix+vfvb06/efOm2rRpo4sXL+rZZ59VjRo1FBYWplOnTmnAgAFKTEw05+3Zs6f27t2r2bNna/78+R6XBQCQhHADAD7u+PHjWrJkiYYNG6YyZcq4nKdy5coyDEMVK1Z0GQKSO336tG7cuOFUe3P48GFJMkcZW7lypSpVqqTVq1c71Uq8+OKLLpd55MiRFH87fPhwilHLNmzYoIULF2rcuHFatmyZ+vfvr507dyogIOUpac6cOTp37pxeeumlVN9Lp06ddOrUKe3fv98chez111/XoUOHzHkqV66sTZs2qXnz5k5NxZKrXLmyJOngwYPpDrVdunRpff755ypZsqQ+++wzPf300+rQoYMZlA4cOKDDhw9r8eLFTgMIbNy4McWy/Pz8NGPGDB04cEDHjx/XnDlzdPbsWfXp0yfNMgAAnNEsDQB83Kuvvip/f3+XfW3sHnzwQfn7+2vSpElOwyxLSX1WLly44PS3hIQEvfvuu+bzuLg4vfvuuypRooQaNmwoSWYNkOPydu7cqe3bt7ssw9q1a3Xq1Cnz+Y8//qidO3eqffv25t8uX76sQYMG6Y477tCrr76qhQsXau/evS6bnF27dk1TpkzR6NGjXQ577ahAgQJq3ry5WrdurdatW6doumcfieyVV15J8dqEhARzNLn77rtPBQoU0NSpUxUTE+M0X/LtWq1aNZUsWVKSNHv2bLNZmZ2r7WcYht58802X72H27NnavHmzli1bptatW6t58+ZpvmcAQErU3ACAj9u3b59GjBiRaq2NlFTjMHnyZE2YMEF//vmnunTpogIFCuj48eNas2aNhgwZorFjx5rzlylTRtOnT9eff/6patWq6ZNPPtG+ffs0f/58BQYGSkqqEVm9erW6du2qjh076vjx45o3b55q1aql69evpyhDlSpV1KJFCw0bNkyxsbGaNWuWihUrpnHjxpnzjBw5UhcuXNCmTZvk7++vdu3aadCgQZo8ebI6d+7s1LRs7969Kl68uNPrM6ply5YaOnSopk6dqn379um+++5TYGCgjhw5ohUrVujNN99Ut27dVLBgQc2cOVODBg1So0aN1KtXLxUpUkQ///yzoqKitHjxYpfLL1WqlF5//XUNGjRIffr0UYcOHVSjRg1VrlxZY8eO1alTp1SwYEGtWrXKZV+aX375RePGjdNLL72kRo0aZfr9AkCelVPDtAEA0mYfCjo4ONhpiGU7x6Gg7VatWmW0aNHCCAsLM8LCwowaNWoYw4cPNw4dOpTidbt37zaaNm1q5MuXz6hQoYLx9ttvOy0rMTHRePXVV4
0KFSoYwcHBxm233WasW7cuxTDLjkMW//vf/zYiIiKM4OBg48477zR+/vlnc77PPvssxRDUhmEYV69eNSpUqGDUq1f
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x700 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Hierarchical clustering: Ward linkage on the standardized data\n",
|
|||
|
"linked = linkage(scaled_data, 'ward')\n",
|
|||
|
"\n",
|
|||
|
"# Visualization: dendrogram of the resulting merge tree\n",
|
|||
|
"plt.figure(figsize=(10, 7))\n",
|
|||
|
"dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)\n",
|
|||
|
"plt.title('Иерархическая')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 53,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Derive flat cluster labels by cutting the linkage tree into at most n_clusters clusters\n",
|
|||
|
"n_clusters = 3\n",
|
|||
|
"hierarchical_labels = fcluster(linked, n_clusters, criterion='maxclust')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 54,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0wAAAIjCAYAAAAwSJuMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3xUVfrH8c+901JIoYXeiwiIUhSwgYooVhR11bWvZRUrtnV3f5Z1Fdfee1kLYkfRVbCLvYBYUGkC0jtJSJl2z++POwmEtJmQSf2+Xy9eu5l7zr1nJjG5z33OeY5ljDGIiIiIiIhIOXZ9D0BERERERKShUsAkIiIiIiJSCQVMIiIiIiIilVDAJCIiIiIiUgkFTCIiIiIiIpVQwCQiIiIiIlIJBUwiIiIiIiKVUMAkIiIiIiJSCQVMIiIiIiIilVDAJCIiIiIiUgkFTCIizcx///tfLMviu+++K3fsjDPOwLIsBg4cWA8jExERaXgUMImICACLFi3iueeeq+9hiIiINCje+h6AiIg0DDfddBM+n4/evXvX91BEREQaDGWYRESExYsX89xzz3HeeefRvn37csefe+45hg4dSmpqKq1ateLEE09k+fLlZdqMHj2agQMHMnv2bPbee29SU1Pp0aMHDz/8cJl2oVCIa6+9lqFDh5KVlUV6ejr77bcfH330UZl2S5cuxbIsbr/9du666y66detGamoqo0aN4ueffy5tt27dOtq2bcvo0aMxxpS+vmjRItLT0/nTn/5UZoyjR48uc51vv/0Wy7KwLKv0tY8//hjLsvj444/LtD3jjDPo3r17mdccx+Huu+9mwIABpKSk0K5dO8477zw2b95c7nN85513GDVqFBkZGWRmZrLnnnvy/PPPVzm+m266Cdu2y7T79NNPOf744+natSuBQIAuXbpw2WWXUVRUVO6ar7zyCsOGDSMjI6P0fZZ8riIiUj0FTCIiwr///W+8Xi9XX311uWM33XQTp512Gn369OHOO+/k0ksv5YMPPmD//fdny5YtZdpu3ryZww47jKFDh3LrrbfSuXNnzj//fJ588snSNnl5eTz++OOMHj2a//znP1x//fWsX7+eQw45hLlz55a7/jPPPMO9997LxIkTueaaa/j555858MADWbt2LQA5OTk89NBDfPLJJ9x3332AG8ScccYZZGRk8OCDD1b53it6z4k477zzuPLKK9lnn3245557OPPMM5kyZQqHHHII4XC4tN1///tfDj/8cDZt2sQ111zDLbfcwh577MGMGTMqPfdTTz3FP//5T26//XZOPvnk0tdffvllCgsLOf/887nvvvs45JBDuO+++zjttNPK9P/yyy854YQTiEaj3HLLLTz77LPcddddO/V+RUSaHSMiIs3KU089ZQDz7bffGmOMWbx4sfF6vebiiy82xhgzatQoM2DAAGOMMUuXLjUej8fcdNNNZc7x008/Ga/XW+b1UaNGGcDccccdpa8Fg0Gzxx57mJycHBMKhYwxxkQiERMMBsucb/PmzaZdu3bmrLPOKn1tyZIlBjCpqalmxYoVpa9//fXXBjCXXXZZmXOcdNJJJi0tzSxYsMDcdtttBjCvv/56mTajRo0yo0aNKv367bffNoA59NBDzfZ/Ej/55BMDmA8//LBM/9NPP91069at9OtPP/3UAGbKlCll2s2YMaPM61u2bDEZGRlm+PDhpqioqExbx3EqHN///vc/4/V6zeWXX252VFhYWO61yZMnG8uyzLJly0pfu+aaawxgVq9eXfpayed62223lTuHiIiUpwyTiEgzV5Jd+tvf/lbu2GuvvYbjOJxwwgls2LCh9F/79u3p06dPuWl0Xq+X8847r/Rrv9/Peeedx7p165g9ezYAHo8Hv98PuJmgTZs2EYlEGDZsGHPmzCk3hvHjx9OpU6fSr/faay+GDx/O22+/Xabd/fffT1ZWFscddxz/93//x6mnnsrRRx9d6fs2xn
DNNdcwYcIEhg8fXuZYTk4OACtWrKi0P7iZnqysLA4++OAyn8/QoUNp0aJF6efz3nvvkZ+fz9/+9jdSUlLKnGP7qYAlvvnmG0444QQmTJjAbbfdVu54ampq6f8vKChgw4YN7L333hhj+P7770uP5efnY9s22dnZVb4PERGpnAImEZFm7Pfff+fZZ5/l3HPPpUOHDuWOL1y4EGMMffr0oW3btmX+/frrr6xbt65M+44dO5Kenl7mtb59+wLumqQSTz/9NIMGDSIlJYXWrVvTtm1b/ve//5Gbm1tuDH369Cn3Wt++fcucD6BVq1bce++9/Pjjj2RlZXHvvfdW+d6nTJnCvHnzuPnmm8sd69mzJ+3bt+f222/nxx9/LA2EgsFguc8nNzeXnJyccp/P1q1bSz+fxYsXA8RVrn3lypUcfvjhFBQUsHHjxgoDqj/++IMzzjiDVq1a0aJFC9q2bcuoUaMAynyGI0eOxHEcLrnkEhYvXsyGDRsqXFslIiKVU5U8EZFm7Kabbqp07RK4GSDLsnjnnXfweDzljrdo0SLhaz733HOcccYZjB8/niuvvJKcnBw8Hg+TJ08uDSxqaubMmYC7lmrFihWVZlZCoRD/93//x1/+8pfSgG57fr+fxx57jJNPPpndd9+9zLFu3bqV/n/HccjJyWHKlCkVXqdt27YJv4dFixYxZMgQ7rrrLk499VSefvppTj/99NLj0WiUgw8+mE2bNnH11VfTr18/0tPTWblyJWeccQaO45S2PfHEE5kzZw733Xcfjz76aMJjERERBUwiIs3WkiVLeOaZZzj//PPp2LFjhW169eqFMYYePXpUGFjsaNWqVRQUFJTJMi1YsACgtLrcK6+8Qs+ePXnttdfKZE+uu+66Cs+5cOHCcq8tWLCgXLW6GTNm8Pjjj3PVVVcxZcoUTj/9dL7++mu83vJ/6h588EHWrVvH9ddfX+l7OeKII1i5ciU//vhjafW52267jfnz55e26dWrF++//z777LNPmWlyO+rVqxcAP//8c7Vl2zt06MDbb79Nu3bteOONN7j88ss57LDDSoOvn376iQULFvD000+XKfLw3nvvlTuXbdvcfvvt/PTTTyxZsoQHH3yQtWvXcsopp1Q5BhER2UZT8kREmqmbb74Zj8dT4dqlEsceeywej4cbbrihTMlucNcAbdy4scxrkUiERx55pPTrUCjEI488Qtu2bRk6dChAaaZq+/N9/fXXfPnllxWO4fXXX2flypWlX3/zzTd8/fXXjBs3rvS1LVu2cPbZZ7PXXntx88038/jjjzNnzpwKp9vl5+dz0003cdlll1VYQn17GRkZ7LPPPowZM4YxY8aUm7ZYUoHuxhtvLNc3EomUVhEcO3YsGRkZTJ48meLi4jLtdvxc+/btS7t27QC47777SqfUlajo8zPGcM8991T4Hu677z4+/PBDpkyZwpgxY9hnn32qfM8iIlKWMkwiIs3U3LlzufDCCyvNLoGbGfn3v//NNddcw9KlSxk/fjwZGRksWbKEadOmce6553LFFVeUtu/YsSP/+c9/WLp0KX379uXFF19k7ty5PProo/h8PsDN3Lz22mscc8wxHH744SxZsoSHH36Y/v37s3Xr1nJj6N27N/vuuy/nn38+wWCQu+++m9atW3PVVVeVtrnkkkvYuHEj77//Ph6Ph0MPPZSzzz6bf//73xx99NFlptXNmTOHNm3alOlfU6NGjeK8885j8uTJzJ07l7Fjx+Lz+Vi4cCEvv/wy99xzD8cddxyZmZncddddnH322ey5556cfPLJtGzZkh9++IHCwkKefvrpCs/fvn17brvtNs4++2xOOeUUDjvsMPr160evXr244oorWLlyJZmZmbz66qsVrk2aN28eV111Fddffz177rnnTr9fEZFmqb7K84mISP0oKSseCATKlOsusX1Z8RKvvvqq2XfffU16erpJT083/fr1MxMnTjTz588v1++7774zI0eONCkpKaZbt2
7m/vvvL3Mux3HMzTffbLp162YCgYAZPHiweeutt8qV7N6+/PUdd9xhunTpYgKBgNlvv/3MDz/8UNrujTfeKFfO3Bh
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Plot the hierarchical clusters in the 2-D PCA projection\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=hierarchical_labels, palette='inferno', s=100)\n",
|
|||
|
"plt.title('Иерархическая')\n",
|
|||
|
"plt.xlabel('PC1')\n",
|
|||
|
"plt.ylabel('PC2')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 55,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" year index price log_indexprice inflationrate oil prices \\\n",
|
|||
|
"Cluster \n",
|
|||
|
"0 1998.054545 13563.522364 3.929091 0.054182 34.765091 \n",
|
|||
|
"1 2005.619048 7237.508776 3.713401 0.020680 48.031361 \n",
|
|||
|
"2 2009.294118 3554.822941 3.494118 0.022941 59.845294 \n",
|
|||
|
"\n",
|
|||
|
" exchange_rate gdppercent percapitaincome unemploymentrate \\\n",
|
|||
|
"Cluster \n",
|
|||
|
"0 85.857273 0.045818 7502.927273 0.061818 \n",
|
|||
|
"1 6.610340 0.029320 27037.510204 0.077823 \n",
|
|||
|
"2 1.000000 0.025294 49157.352941 0.058235 \n",
|
|||
|
"\n",
|
|||
|
" manufacturingoutput tradebalance USTreasury \n",
|
|||
|
"Cluster \n",
|
|||
|
"0 132.100000 -6.739455 0.063636 \n",
|
|||
|
"1 473.491633 34.495510 0.042993 \n",
|
|||
|
"2 251.887059 -555.851765 0.035294 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Attach the K-Means cluster labels to the dataframe as a 'Cluster' column (modifies df in place)\n",
|
|||
|
"df['Cluster'] = kmeans_labels\n",
|
|||
|
"\n",
|
|||
|
"# Keep only float64/int64 columns before computing the means\n",
|
|||
|
"# NOTE(review): 'Cluster' is absent from the printed means, so the labels are presumably not int64 — confirm\n",
|
|||
|
"numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns\n",
|
|||
|
"cluster_analysis = df.groupby('Cluster')[numeric_columns].mean()\n",
|
|||
|
"\n",
|
|||
|
"# Print the per-cluster means\n",
|
|||
|
"print(cluster_analysis)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 59,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оценка для неиерархического: 0.20251\n",
|
|||
|
"Оценка для иерархического: 0.20251\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Оценка\n",
|
|||
|
"print(f\"Оценка для неиерархического: {round(silhouette_avg_kmeans,5)}\")\n",
|
|||
|
"\n",
|
|||
|
"silhouette_avg = silhouette_score(scaled_data, kmeans_labels)\n",
|
|||
|
"print(f\"Оценка для иерархического: {round(silhouette_avg,5)}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
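|
"Если обе оценки совпадают с точностью до всех знаков, это говорит не о нехватке данных, а о том, что силуэт оба раза посчитан по одним и тем же меткам (kmeans_labels); для иерархической кластеризации его следует считать по hierarchical_labels, и тогда значения, как правило, различаются."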
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Scripts",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.0"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|