AIM-PIbd-32-Filippov-D-S/Lab_5/lab5.ipynb

1290 lines
2.7 MiB
Plaintext
Raw Normal View History

2024-12-08 22:51:37 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Лабораторная работа №5\n",
"\n",
"*Вариант задания:* набор данных о наблюдениях НЛО (NUFORC, файл `nuforc_reports.csv`)."
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 27,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['summary', 'city', 'state', 'date_time', 'shape', 'duration', 'stats',\n",
" 'report_link', 'text', 'posted', 'city_latitude', 'city_longitude'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
2024-12-14 07:16:17 +04:00
"df = pd.read_csv(\"../../datasets/nuforc_reports.csv\")\n",
2024-12-08 22:51:37 +04:00
"df = df.iloc[:1000].dropna()\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 28,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>summary</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>date_time</th>\n",
" <th>shape</th>\n",
" <th>duration</th>\n",
" <th>stats</th>\n",
" <th>report_link</th>\n",
" <th>text</th>\n",
" <th>posted</th>\n",
" <th>city_latitude</th>\n",
" <th>city_longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Viewed some red lights in the sky appearing to...</td>\n",
" <td>Visalia</td>\n",
" <td>CA</td>\n",
" <td>2021-12-15T21:45:00</td>\n",
" <td>light</td>\n",
" <td>2 minutes</td>\n",
" <td>Occurred : 12/15/2021 21:45 (Entered as : 12/...</td>\n",
" <td>http://www.nuforc.org/webreports/165/S165881.html</td>\n",
" <td>Viewed some red lights in the sky appearing to...</td>\n",
" <td>2021-12-19T00:00:00</td>\n",
" <td>36.356650</td>\n",
" <td>-119.347937</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Look like 1 or 3 crafts from North traveling s...</td>\n",
" <td>Cincinnati</td>\n",
" <td>OH</td>\n",
" <td>2021-12-16T09:45:00</td>\n",
" <td>triangle</td>\n",
" <td>14 seconds</td>\n",
" <td>Occurred : 12/16/2021 09:45 (Entered as : 12/...</td>\n",
" <td>http://www.nuforc.org/webreports/165/S165888.html</td>\n",
" <td>Look like 1 or 3 crafts from North traveling s...</td>\n",
" <td>2021-12-19T00:00:00</td>\n",
" <td>39.174503</td>\n",
" <td>-84.481363</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>One red light moving switly west to east, beco...</td>\n",
" <td>Knoxville</td>\n",
" <td>TN</td>\n",
" <td>2021-12-10T19:30:00</td>\n",
" <td>triangle</td>\n",
" <td>20-30 seconds</td>\n",
" <td>Occurred : 12/10/2021 19:30 (Entered as : 12/...</td>\n",
" <td>http://www.nuforc.org/webreports/165/S165825.html</td>\n",
" <td>One red light moving switly west to east, beco...</td>\n",
" <td>2021-12-19T00:00:00</td>\n",
" <td>35.961561</td>\n",
" <td>-83.980115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>I'm familiar with all the fakery and UFO sight...</td>\n",
" <td>Fullerton</td>\n",
" <td>CA</td>\n",
" <td>2020-07-07T23:00:00</td>\n",
" <td>unknown</td>\n",
" <td>2 minutes</td>\n",
" <td>Occurred : 7/7/2020 23:00 (Entered as : 07/07...</td>\n",
" <td>http://www.nuforc.org/webreports/157/S157444.html</td>\n",
" <td>I'm familiar with all the fakery and UFO sight...</td>\n",
" <td>2020-07-09T00:00:00</td>\n",
" <td>33.877422</td>\n",
" <td>-117.924978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>I was driving up lakes mead towards the lake a...</td>\n",
" <td>Las Vegas</td>\n",
" <td>NV</td>\n",
" <td>2020-04-23T03:00:00</td>\n",
" <td>oval</td>\n",
" <td>10 minutes</td>\n",
" <td>Occurred : 4/23/2020 03:00 (Entered as : 4/23...</td>\n",
" <td>http://www.nuforc.org/webreports/155/S155608.html</td>\n",
" <td>I was driving up lakes mead towards the lake a...</td>\n",
" <td>2020-05-01T00:00:00</td>\n",
" <td>36.141246</td>\n",
" <td>-115.186592</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" summary city state \\\n",
"0 Viewed some red lights in the sky appearing to... Visalia CA \n",
"1 Look like 1 or 3 crafts from North traveling s... Cincinnati OH \n",
"3 One red light moving switly west to east, beco... Knoxville TN \n",
"5 I'm familiar with all the fakery and UFO sight... Fullerton CA \n",
"6 I was driving up lakes mead towards the lake a... Las Vegas NV \n",
"\n",
" date_time shape duration \\\n",
"0 2021-12-15T21:45:00 light 2 minutes \n",
"1 2021-12-16T09:45:00 triangle 14 seconds \n",
"3 2021-12-10T19:30:00 triangle 20-30 seconds \n",
"5 2020-07-07T23:00:00 unknown 2 minutes \n",
"6 2020-04-23T03:00:00 oval 10 minutes \n",
"\n",
" stats \\\n",
"0 Occurred : 12/15/2021 21:45 (Entered as : 12/... \n",
"1 Occurred : 12/16/2021 09:45 (Entered as : 12/... \n",
"3 Occurred : 12/10/2021 19:30 (Entered as : 12/... \n",
"5 Occurred : 7/7/2020 23:00 (Entered as : 07/07... \n",
"6 Occurred : 4/23/2020 03:00 (Entered as : 4/23... \n",
"\n",
" report_link \\\n",
"0 http://www.nuforc.org/webreports/165/S165881.html \n",
"1 http://www.nuforc.org/webreports/165/S165888.html \n",
"3 http://www.nuforc.org/webreports/165/S165825.html \n",
"5 http://www.nuforc.org/webreports/157/S157444.html \n",
"6 http://www.nuforc.org/webreports/155/S155608.html \n",
"\n",
" text posted \\\n",
"0 Viewed some red lights in the sky appearing to... 2021-12-19T00:00:00 \n",
"1 Look like 1 or 3 crafts from North traveling s... 2021-12-19T00:00:00 \n",
"3 One red light moving switly west to east, beco... 2021-12-19T00:00:00 \n",
"5 I'm familiar with all the fakery and UFO sight... 2020-07-09T00:00:00 \n",
"6 I was driving up lakes mead towards the lake a... 2020-05-01T00:00:00 \n",
"\n",
" city_latitude city_longitude \n",
"0 36.356650 -119.347937 \n",
"1 39.174503 -84.481363 \n",
"3 35.961561 -83.980115 \n",
"5 33.877422 -117.924978 \n",
"6 36.141246 -115.186592 "
]
},
2024-12-14 07:16:17 +04:00
"execution_count": 28,
2024-12-08 22:51:37 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 29,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>city_latitude</th>\n",
" <th>city_longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>712.000000</td>\n",
" <td>712.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>39.350240</td>\n",
" <td>-97.057660</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>5.558375</td>\n",
" <td>17.807918</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>25.774143</td>\n",
" <td>-149.336500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>34.950725</td>\n",
" <td>-116.385628</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>39.597100</td>\n",
" <td>-93.326900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>43.262550</td>\n",
" <td>-82.476700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>61.214900</td>\n",
" <td>-61.260300</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" city_latitude city_longitude\n",
"count 712.000000 712.000000\n",
"mean 39.350240 -97.057660\n",
"std 5.558375 17.807918\n",
"min 25.774143 -149.336500\n",
"25% 34.950725 -116.385628\n",
"50% 39.597100 -93.326900\n",
"75% 43.262550 -82.476700\n",
"max 61.214900 -61.260300"
]
},
2024-12-14 07:16:17 +04:00
"execution_count": 29,
2024-12-08 22:51:37 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 30,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"summary 0\n",
"city 0\n",
"state 0\n",
"date_time 0\n",
"shape 0\n",
"duration 0\n",
"stats 0\n",
"report_link 0\n",
"text 0\n",
"posted 0\n",
"city_latitude 0\n",
"city_longitude 0\n",
"dtype: int64\n",
"summary False\n",
"city False\n",
"state False\n",
"date_time False\n",
"shape False\n",
"duration False\n",
"stats False\n",
"report_link False\n",
"text False\n",
"posted False\n",
"city_latitude False\n",
"city_longitude False\n",
"dtype: bool\n"
]
}
],
"source": [
"# Процент пропущенных значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
"\n",
"print(df.isnull().sum())\n",
"\n",
"print(df.isnull().any())"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 31,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"summary object\n",
"city object\n",
"state object\n",
"date_time object\n",
"shape object\n",
"duration object\n",
"stats object\n",
"report_link object\n",
"text object\n",
"posted object\n",
"city_latitude float64\n",
"city_longitude float64\n",
"dtype: object"
]
},
2024-12-14 07:16:17 +04:00
"execution_count": 31,
2024-12-08 22:51:37 +04:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Проверка типов столбцов\n",
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-12-14 07:16:17 +04:00
"### **Атрибуты и их описание**\n",
"\n",
" **`city` (object)** \n",
" - **Описание**: Город, где произошло наблюдение. \n",
" - **Применение**: Географическая группировка или создание карт по популярности наблюдений в различных городах.\n",
"\n",
" **`state` (object)** \n",
" - **Описание**: Штат, где произошло наблюдение. \n",
" - **Применение**: Анализ распределения по штатам, создание региональных кластеров наблюдений.\n",
"\n",
" **`city_latitude` (float64)** \n",
" - **Описание**: Географическая широта города наблюдения. \n",
" - **Применение**: Географический анализ и кластеризация наблюдений.\n",
2024-12-08 22:51:37 +04:00
"\n",
2024-12-14 07:16:17 +04:00
" **`city_longitude` (float64)** \n",
" - **Описание**: Географическая долгота города наблюдения. \n",
" - **Применение**: Географический анализ и визуализация наблюдений на карте.\n",
2024-12-08 22:51:37 +04:00
"\n",
2024-12-14 07:16:17 +04:00
"\n",
"### **Бизнес-цель**: \n",
"**Выявление географических кластеров активности наблюдений НЛО для исследования и стратегического планирования.**\n",
"\n",
"#### **Описание цели**:\n",
"Разделить географические области (широты) на кластеры, чтобы определить зоны с разной интенсивностью наблюдений. \n",
"\n",
"\n",
"#### **Потенциальное применение**:\n",
"- Научные организации могут использовать результаты для анализа феноменов НЛО.\n",
"- Туристические компании могут разработать маршруты в популярные зоны наблюдений.\n",
"- СМИ и исследовательские проекты могут строить прогнозы о возможных местах будущих наблюдений.\n"
2024-12-08 22:51:37 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Очистка данных"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Цель: Упростить набор данных, удалив несущественные столбцы, чтобы сосредоточиться на ключевых атрибутах, которые будут использоваться для кластеризации и анализа.\n",
"\n",
"Столбцы `summary`, `stats`, `report_link`, `duration` и `text` несущественны для анализа: они не содержат информации, полезной для решения задачи."
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 32,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" city state date_time shape posted \\\n",
"0 Visalia CA 2021-12-15T21:45:00 light 2021-12-19T00:00:00 \n",
"1 Cincinnati OH 2021-12-16T09:45:00 triangle 2021-12-19T00:00:00 \n",
"3 Knoxville TN 2021-12-10T19:30:00 triangle 2021-12-19T00:00:00 \n",
"5 Fullerton CA 2020-07-07T23:00:00 unknown 2020-07-09T00:00:00 \n",
"6 Las Vegas NV 2020-04-23T03:00:00 oval 2020-05-01T00:00:00 \n",
"\n",
" city_latitude city_longitude \n",
"0 36.356650 -119.347937 \n",
"1 39.174503 -84.481363 \n",
"3 35.961561 -83.980115 \n",
"5 33.877422 -117.924978 \n",
"6 36.141246 -115.186592 \n"
]
}
],
"source": [
"# Удаление несущественных столбцов\n",
"columns_to_drop = [ \"summary\", \"stats\", \"report_link\", \"duration\", \"text\"] # Столбцы, которые можно удалить\n",
"#\"date_time\", \"posted\", \"city\", \"state\",\n",
"df_cleaned = df.drop(columns=columns_to_drop)\n",
"\n",
"print(df_cleaned.head()) # Вывод очищенного DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Визуализация парных взаимосвязей\n",
"Визуализировать ключевые атрибуты наблюдений НЛО для выявления закономерностей и связей между ними."
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 33,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAABi8AAA7UCAYAAAC0+MtoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXyU1dn/8e89k8meTMISIGBIk7AMEBZRxA0FbK0LWrG1v1qhVSuKVou27lZbn1ar1qWgj0tVVGrV2lop1bZuqH0EURQUTBACBIwJhCWZ7Pv8/jhOQiBIyExm7sl83q8Xr5DrkPuck5Bk7vs65zqWz+fzCQAAAAAAAAAAwCYc4R4AAAAAAAAAAADAvkheAAAAAAAAAAAAWyF5AQAAAAAAAAAAbIXkBQAAAAAAAAAAsBWSFwAAAAAAAAAAwFZIXgAAAAAAAAAAAFsheQEAAAAAAAAAAGyF5AUAAAAAAAAAALAVkhcAAABABPD5fCH9OLuy03y6MxY7jRcAAACIJCQvAAAAAJvbtGmTfvCDHxz2x3300UeaN29eL4woNObMmaM5c+a0v//iiy/qrrvuCno/M2bM0A033HBYH/Pmm2/q+uuvb39/1apVGjVqlFatWiVJ2rFjh+bNm6cvv/wy4PGVlJRo1KhReumllwK+FgAAABApYsI9AAAAAABf79///rfWrFlz2B/34osvavPmzb0wotC47bbbOr3/8MMPa8qUKWEaTWdPPfVUp/fHjh2rF154QXl5eZKkFStW6J133gnDyAAAAIC+geQFAAAAAFvyJwIiQXJysiZOnBjuYQAAAAB9BmWjAAAAgDBbv369fvSjH2ny5MmaNGmSfvzjH2vt2rWSpEWLFunBBx+UJI0aNUqLFi2SJO3du1e//vWvNX36dI0bN05TpkzRFVdcoZKSEknSDTfcoL///e/68ssvO5Ucamxs1N13362TTjpJ48aN06xZs/Tqq68edGyNjY2aPHnyAeWaWlpaNHXqVP3mN7855BwOpqmpSQ888IBmzpyp8ePH68wzz9Tf//739vZ9y0bNmDFDX375pf7+979r1KhR+uyzz5Sfn6/77ruv0zXr6+s1efJkPfzww1/b99cpKSnRddddpxNOOEFjx47Vscceq+uuu04VFRXt4/rggw/0wQcftJeK2rds1EsvvaQbb7xRkjRz5sz2klT7fv38Fi1apFGjRnWKvfbaazrrrLM0fvx4nXPOOdqwYcMBY6ysrNStt96q4447Tvn5+TrvvPO0cuXKHs8ZAAAAsBuSFwAAAEAY1dTU6Cc/+YnS09O1aNEi3X///aqvr9fFF1+s6upqfe9739N3v/tdSdILL7yg733ve/L5fLr00kv13nvv6Re/+IWeeOIJ/fSnP9XKlSvbSy1dfvnlOumkkzRw4EC98MILOvnkk+Xz+XTFFVfo+eef14UXXqiHH35YkyZN0tVXX62XX365y/HFxcXp1FNP1b/+9a9Oh0+/9957qqio0Nlnn33IORzML37xCy1evFjf+9739Oijj+qEE07QDTfcoH/+858H/NsHH3xQAwcO1EknnaQXXnhBI0aM0CmnnKJly5Z1Gtfrr7+uuro6fec73+nBV8MkP+bOnavNmzfrtttu0xNPPKG5c+fqlVde0f333y/JlLMaM2aMxowZoxdeeEFjx47tdI2TTz5Z8+fPbx/35Zdf3u3+33rrLV111VUaNWqUHnroIZ122mm69tprO/2bxsZG/ehHP9Kbb76pq6++Wg8++KAGDx6sn/zkJyQwAAAA0GdQNgoAAAAIo6KiIlVUVGju3Lk68sgjJUk5OTl64YUXVFtbq8GDB2vw4MGS1F6WaOfOnUpISND111+vo446SpJ0zDHHaPv27XrhhRckSVlZWerXr59iY2PbP+69997Tf//7X91///06/fTTJUknnnii6uvr9fvf/15nnnmmYmIOvEU4++yz9be//U0fffRRe3+vvPKKcn
JylJ+fr7Vr137tHFJSUg645saNG/Wf//xHN910k370ox9Jko499lh9+eWXWrVqlc4888xO/37MmDGKjY1Vv3792udz7rnn6tVXX9WqVas0depUSdLLL7+s4447TkOGDDn8L4ak4uJiDR48WHfddZeOOOIISdLUqVP1ySef6IMPPpBkylklJydLUpelovr166esrCxJksfj0bBhw7rd/0MPPaTx48frnnvukWS+PpJ07733tv+bpUuXasOGDfrLX/6iCRMmSJKmTZumOXPm6Pe//73+9re/HeasAQAAAPth5wUAAAAQRiNGjFC/fv102WWX6dZbb9Xrr7+uAQMG6Nprr21PWuxv0KBBeuaZZzR58mSVlJTovffe05IlS/Txxx+rqanpoH2tXLlSlmXppJNOUktLS/ufGTNmaNeuXdq0aVOXHzdlyhRlZmbqlVdekWRW/r/xxhs6++yzezyHjz76SJL0rW99q1N80aJF+p//+Z+v/6R95bjjjlNmZqaWLl0qSdqxY4dWrlypc845p1sf3xWPx6M///nPGjp0qIqLi/XOO+/oiSee0JYtW772cxsMDQ0N+uyzzzR9+vRO8dNOO63T+ytXrtTAgQM1duzY9q9ha2urpk+frvXr18vr9fbqOAEAAIBQYOcFAAAAEEZJSUl69tln9fDDD+tf//qXXnjhBcXHx+vss8/WLbfcotjY2C4/7h//+Ifuu+8+lZWVKS0tTR6PR/Hx8V/bV2VlpXw+X/vuiP2Vl5fL4/EcELcsS7NmzdKLL76oW265RcuXL1ddXZ1mzZrV4zlUVlZKkvr37/+1Y/46DodDs2fP1uLFi3Xbbbdp6dKlSk5O1je/+c0eX1OSFi9erEceeUSVlZUaMGCAxo0bp4SEhK8tgRUMXq9XPp9P6enpneIZGRmd3q+srNSuXbsOKFflt2vXLrnd7l4bJwAAABAKJC8AAACAMMvJydE999yj1tZWffrpp1q6dKmee+45ZWVl6Sc/+ckB/3716tW6/vrrNWfOHF188cUaNGiQJOnuu+9u39HQlZSUFCUmJuqZZ57psn348OEH/dizzz5bjz76qFatWqVXX31VRx99tIYOHdrjOaSmpkoyB4/vuztj8+bNqqys1OTJkw86ln3Nnj1bDz30kN59913961//0umnn664uLhufWxXli1bpt/97ne69tprNXv2bPXr10+S9LOf/Uzr1q3r8XX9WltbO71fV1fX/ve0tDQ5HA7t3r2707/xJ3r8UlJSlJ2drd///vdd9nE4ZaoAAAAAu6JsFAAAABBG//73vzV16lTt2rVLTqdTkyZN0q9+9SulpqaqtLRUktlhsK81a9aora1NV155ZXviorW1VStWrJAktbW1dflxU6ZMUV1dnXw+n/Lz89v/bNy4UQ899JBaWloOOs7c3FyNHTtWr7zyit555x2dddZZhzWH/fmTE2+99Van+O9//3v99re/7fJj9p+PJA0dOlTHHnusnnnmGRUWFmr27NkHnUN3fPTRR0pNTdVPfvKT9sRFbW2tPvroo/bP68HGcqixJicna+fOnZ1iH3/8cfvf4+LiNGnSJL322mudDiHf/3M0ZcoUlZWVqX///p2+ju+9954ef/xxOZ3O7k8YAAAAsCl2XgAAAABhdOSRR6qtrU1XXHGF5s2bp6SkJP3rX/9SdXV1+3kQ/l0K//znPzVhwgSNHz9eknT77bfr3HPPldfr1bPPPqsNGzZIMqv5k5OTlZqaqt27d+udd96Rx+PRSSedpKOPPlqXX365Lr/8cuXm5urTTz/VwoULdeKJJ7Y/rD+Ys88+W3fddZdiYmL07W9/+7DmsL/Ro0fr29/+tu655x41NDTI4/Ho3Xff1fLly/Xggw92+TGpqakqKCjQBx98oPHjx7eXyfrud7+ra665Rrm5ue0HWPfU+PHj9dxzz+l3v/udpk+frvLycj3xxBPavXt3p1JMqampWrNmjVauXKkxY8Z0OVZJev311z
Vt2jTl5ubq5JNP1iuvvKIJEyZo+PDheumll7Rt27ZOH3fNNdfoRz/6kX7605/q+9//vrZu3apHHnmk07+ZPXu2/vS
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 1600x4500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Настройка стиля графиков\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"# Создание фигуры\n",
"plt.figure(figsize=(16, 45))\n",
"\n",
"# Plot 1: state vs city_latitude (scatter)\n",
"plt.subplot(4, 1, 1)\n",
"sns.scatterplot(x=df_cleaned['state'], y=df_cleaned['city_latitude'], alpha=0.6, color='blue')\n",
2024-12-14 07:16:17 +04:00
"plt.title('state vs city_latitude')\n",
2024-12-08 22:51:37 +04:00
"plt.xlabel('state')\n",
"plt.ylabel('city_latitude')\n",
"plt.xticks(rotation=90)\n",
"\n",
"# Plot 2: city vs city_latitude (box plot)\n",
"plt.subplot(4, 1, 2)\n",
"sns.boxplot(x=df_cleaned['city'], y=df_cleaned['city_latitude'], color='green')\n",
2024-12-14 07:16:17 +04:00
"plt.title('city vs city_latitude')\n",
"plt.xlabel('city')\n",
2024-12-08 22:51:37 +04:00
"plt.ylabel('city_latitude')\n",
"plt.xticks(rotation=90)\n",
"\n",
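"# Plot 3: number of reports per city, coloured by state\n",
"# NOTE(review): x is the city column, yet the x-label set below reads 'state'; consider 'city'\n",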
"plt.subplot(4, 1, 3)\n",
2024-12-14 07:16:17 +04:00
"sns.countplot(x=df_cleaned['city'], hue=df_cleaned['state'], palette='Set3')\n",
"plt.title('state vs city')\n",
2024-12-08 22:51:37 +04:00
"plt.xlabel('state')\n",
"plt.ylabel('Count')\n",
"plt.xticks(rotation=90)\n",
"\n",
"# Упорядочиваем графики\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Стандартизация данных для кластеризации"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 34,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" city_latitude state_AB state_AK state_AL state_AR state_AZ state_BC \\\n",
"0 -0.538951 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 -0.031639 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 -0.610081 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 -0.985300 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 -0.577732 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" state_CA state_CO state_CT ... city_Winchester city_Winnsboro \\\n",
"0 1.0 0.0 0.0 ... 0.0 0.0 \n",
"1 0.0 0.0 0.0 ... 0.0 0.0 \n",
"2 0.0 0.0 0.0 ... 0.0 0.0 \n",
"3 1.0 0.0 0.0 ... 0.0 0.0 \n",
"4 0.0 0.0 0.0 ... 0.0 0.0 \n",
"\n",
" city_Winston city_Woodburn city_Woodland city_Woodland Park \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"\n",
" city_Woonsocket city_Yarmouth city_Yelm city_Yuma \n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"\n",
"[5 rows x 636 columns]\n"
]
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"import pandas as pd\n",
"\n",
"# Выделяем числовые и категориальные признаки\n",
"numerical_cols = ['city_latitude']\n",
"categorical_cols = ['state', 'city']\n",
"\n",
"# Масштабирование числовых признаков\n",
"scaler = StandardScaler()\n",
"df_numerical_scaled = scaler.fit_transform(df_cleaned[numerical_cols])\n",
"\n",
"# Кодирование категориальных признаков с помощью OneHotEncoder\n",
"encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # sparse_output=False returns a dense array that can be wrapped in a DataFrame\n",
"encoded_data = encoder.fit_transform(df_cleaned[categorical_cols])\n",
"\n",
"# Создаем новые столбцы для закодированных категориальных признаков\n",
"encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))\n",
"\n",
"# Объединяем числовые и закодированные категориальные данные\n",
"df_encoded = pd.concat([pd.DataFrame(df_numerical_scaled, columns=numerical_cols), encoded_df], axis=1)\n",
"\n",
"# Выводим результат\n",
"print(df_encoded.head())"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 35,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAABR8AAAP0CAYAAAAjkkunAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXycZb3///c9W2Yy2dp0IW1a6AJll6XsqyCIiAjigiJ+9cAPFTwqHBU9KsJREBU4CgKK4oZ4QBTZ3A6boKhA4agIWqAttGnTJk2aTGZf7vv3R0xImkkyk8w9933PvJ7nwcPTzGRyzcy9XNf7/lzXbViWZQkAAAAAAAAAKszndAMAAAAAAAAA1CbCRwAAAAAAAAC2IHwEAAAAAAAAYAvCRwAAAAAAAAC2IHwEAAAAAAAAYAvCRwAAAAAAAAC2IHwEAAAAAAAAYAvCRwAAAAAAAAC2IHwEAACocZZlOd0ER7jpfU/XFje11Ul8DgAA1B7CRwAAMGPnnnuuVq1aNe6/fffdV8cff7yuuOIKDQ4OTvidDRs26PLLL9cb3vAG7b///jr++ON1ySWX6J///Oekf+e///u/tWrVKn3xi1+ctk0f+MAHdOihhyqbzU76nLe85S0655xzJEmrVq3SDTfcUMK7raxPf/rTOuGEE0b/fcIJJ+jTn/50Rf/G1q1bdcEFF2jz5s22/p3ZiMfj+tCHPqTXve51OuSQQ/TKK6+U/RpPPvmkVq1apSeffFJS8fddCXfffbdWrVqlrq6ukn8nFovpU5/6lNasWTP6s3PPPVfnnnvu6L/vuusufeUrX6lIG3fermbzOjvv22P/+81vflOB1r4mm83qqquu0v3331/R1wUAAM4LON0AAADgbXvvvbe+8IUvjP47l8vp+eef13XXXad//OMf+p//+R8ZhiFJ+t///V996lOf0u67764Pf/jD6uzs1NatW/XDH/5Q73znO3XzzTfrqKOOGvf6pmnqnnvu0R577KF7771Xn/jEJxSJRCZtz1lnnaU//vGPevzxx/WGN7xhwuPPP/+8XnzxxdGw584779Quu+xSiY9iVr75zW+qqampoq/5xz/+UY899pjtf2c27rnnHj366KO67LLLtPvuu6uzs7Ps19hnn3105513auXKlZKKv2+n/OMf/9C9996rs846a/RnY/cXSbr55pt16KGHVrtp05o/f76++c1vFn1st912q+jf6unp0Q9/+EN9+ctfrujrAgAA5xE+AgCAWWlqatIBBxww7meHHHKIEomErr/+ev31r3/VAQccoI0bN+rSSy/VMccco69//evy+/2jzz/55JP17ne/W5deeqkeeeQRhUKh0cf+8Ic/aOvWrbruuuv03ve+Vw888IDe8Y53TNqek046Sa2trbrvvvuKho+/+MUv1NTUpDe+8Y2SNKHtTtl7771r6u+UamBgQJL0nve8ZzSkLlexbdDNRkJStwuFQp76XAEAgDsx7RoAANhi3333lSRt2bJFknTbbbcpm83qc5/73LjgUZIikYguvfRSnXXWWROmav/85z/XHnvsoYMPPliHHXaY7rzzzin/bkNDg0477TT97ne/UzweH/dYLpfTL3/5S735zW8erZ7cedr1D3/4Q51yyinab7/9dMwxx+jyyy8ffZ2uri6tWrVKd99997jX3Xmqa6FQ0C233KLTTjtN+++/vw444ACdffbZ+vOf/zxpu8dOh77hhhsmne460tbp/sbdd9+tz3zmM5KkE088cfS1d552PTQ0pC9/+ct6wxveoP3220+nnXaafvazn01o2/XXX6+vfOUrOvLII7X//vvrvPPOm3aKdCaT0Y033jj6eZ588sm65ZZbZJqmpOHpxyPvZ88995xyOvhf/vIX/du//ZsOOuggHX744brkkku0bds2SeOnXRd731/5yle0//77a2hoaNxr3nTTTTr44IOVSqWmfB9Tueuuu/S2t71NBxxwgPbff3
+99a1v1a9//evRdr3vfe+TJL3vfe8bnWo9dtr1CSecoM2bN+sXv/jF6JTuke9/Zztvq4ODg/rMZz6jQw89VIcccoi+9rWvjX62Yz300EN629vepv32209HHXWUvvSlLymZTM74Pc/k9R966CG95z3v0YEHHqh9991Xp5xyim6//XZJw/vViSeeKEn6zGc+M7ov7Tw9XZo4xf7uu+/W3nvvrbvuuktHHXWUDj30UL388stVed8AAKA0hI8AAMAWGzZskCQtWbJEkvT73/9ee++9txYuXFj0+UcccYQuvvhizZ8/f/RnAwMDeuSRR3TGGWdIks4880w999xzev7556f822eddZYymYx++9vfjvv5448/rv7+/kkrJx944AF97Wtf0znnnKNbb71VF110ke69996S1poc65prrtFNN92kd73rXfrud7+rL37xixoYGNDHPvaxkoKud7zjHbrzzjvH/XfwwQcrGo3q1FNPLelvHH/88frwhz8saXiq9YUXXjjh76TTab3nPe/R/fffr/PPP380jPvsZz+rb33rW+Oe+6Mf/Ujr16/Xl7/8ZX3pS1/S3//+d1166aWTvgfLsvShD31I3/3ud/WOd7xD3/rWt3TKKafo61//+ui04y984Qt6+9vfLml4+nuxNkrSCy+8oPe+973KZDL66le/qiuuuEJ///vfdd555ymfz497brH3/fa3v12ZTGbCOoX33nuvTj311Cmn8U/l9ttv12WXXaY3vOEN+va3v61rrrlGoVBIn/jEJ7R161bts88+uuyyyyRJl1122YTp1iNtnD9/vo477jjdeeedWrBgQUl/2zRNnX/++Xrsscd06aWX6uqrr9azzz6rX/3qV+Oed//99+uiiy7S8uXLdeONN+ojH/mI7rvvPl144YUl3dwln89P+G/s75Xy+r/73e900UUXaZ999tFNN92kG264QUuWLNF//dd/6a9//asWLFgwOr37wx/+8KRTvSdTKBT0ve99T1deeaU+85nPaMWKFbN+3wAAoHKYdg0AAGbFsqxxAdDg4KCeeuop3XzzzaNVTtLwTUD22muvsl77/vvvl2maeutb3yppeHr2f/3Xf+mOO+6YMhDcZ599tNdee+n+++8ft9bePffco1WrVmm//fYr+ntPPfWUOjs7dc4558jn8+nQQw9VY2Nj0RvnTKWnp0cXX3zxuKqthoYG/fu//7vWrl077VTWXXbZZdw6lD/4wQ/07LPP6pvf/KZWrFhR8t9YunSpJGmvvfYqupbi3XffrRdffFF33HGHDjzwQEnSMccco3w+r5tuuklnn3222traJEktLS266aabRqtWN27cqBtuuEE7duzQnDlzJrz2448/rj/+8Y+67rrr9OY3v1mSdNRRRykcDusb3/iG3ve+92n33XcffZ9TfSbf+ta31NbWpu9973tqaGiQJC1YsED/8R//oZdeemncc+fOnVv0fR944IG69957R4PnZ599Vq+88oquvvrqSf/udDZt2qTzzjtvXGi6ePFive1tb9MzzzyjN7/5zaNTrFeuXFl0uvXee++tUCikuXPnljXF+fHHH9ff/vY3fec739Gxxx4raTjAH1uBa1mWrrnmGh1zzDG65pprRn++22676f3vf78ee+wxHX/88ZP+jc2bN2ufffaZ8PP/+I//0AUXXFDy67/88ss688wz9dnPfnb0OQceeKAOO+wwPfnkk3rd6143emxYunTpjJYG+NCHPjT6Xmb7vgEAQGURPgIAgFl5+umnJwQUPp9PRx55pP7rv/5rdB0/v9+vQqFQ1mv//Oc/12GHHaZQKKRYLCZpeJrqAw88oEsvvXTKG6ecddZZuuqqq7Rt2zYtXLhQAwMDevTRR/WpT31q0t85/PDDdeedd+ptb3ub3vCGN+i4447TW97ylrLXIrz22mslSf39/Vq/fr1effVVPfroo5I05V24i/n973+vr371q7rwwgvHrWFZib/x1FNPafHixaPB44
jTTz9dP/vZz/TXv/5Vxx13nCRpv/32GzddfiQ0TKVSRcPHp556SoFAQKeccsqE1/7GN76hp556SrvvvntJ7XzmmWd
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 1600x1200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.decomposition import PCA\n",
"import matplotlib.pyplot as plt\n",
"\n",
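"# Apply PCA to the scaled numeric data only\n",
"# NOTE(review): the input has a single column (city_latitude), so a 1-component PCA leaves it unchanged up to sign - confirm this step is intended\n",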
"pca = PCA(n_components=1)\n",
"kc_pca = pca.fit_transform(df_numerical_scaled)\n",
"\n",
"# Визуализация\n",
"plt.figure(figsize=(16, 12))\n",
"plt.scatter(range(len(kc_pca)), kc_pca, alpha=0.6)\n",
"plt.title(\"PCA Visualization of city_latitude Feature\")\n",
"plt.xlabel(\"Sample Index\")\n",
"plt.ylabel(\"Principal Component 1\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Агломеративная (иерархическая) кластеризация"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 36,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAABSAAAAPyCAYAAABsM3EPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeZxd8/0/8NdEMtYgYok1JLGT0BAUQahq0Ta0qnaqtqC1lKD2tZogxN4WsVQVRVutFhW+RVI7rVIJQYgtklgzWe7vj/zmdiYzicmYk7mTeT4fjzzOzPmce877fc+ZmcxrzlJVKpVKAQAAAAAoQIfWLgAAAAAAWHAJIAEAAACAwgggAQAAAIDCCCABAAAAgMIIIAEAAACAwgggAQAAAIDCCCABAAAAgMIIIAEAAACAwgggAQAAaFSpVGrtEsqaUksl1QvA/wggARYAgwcPztprr93ov8GDB7d2eUAdkydPTt++ffPcc89l8uTJOfzww/PrX/+6tcuiAjz33HP5+te/npqamtYupSI88MAD2W233fLWW2/lzTffzIABA/LCCy+0dlkV780338zaa6+dyy67LHfeeWfWXnvtjBo1qkmvHTx4cAYMGFD+/IEHHsiJJ57Y4jXuu+++2XfffefpNU8++WQOOeSQ8ue1fd55551JkilTpuSEE07IE0880SI11r6HTTVq1KhyPZdddlnWXnvtvPnmm6mpqclOO+2UZ555pkXqAmirOrZ2AQC0jOWWWy7Dhw+vN+/II49spWqAOVlqqaVy4IEHZo899kipVMraa6+dn//8561dFq1s6tSpOfHEE/PTn/401dXVrV1ORdhmm21yyy23ZLvttkuSDBw4MBtssEErV1X5qqqqytO6HzfFEUcckf3226/8+fXXX9/i9TXX7373u4wZM6b8+fLLL5/f/va3WW211ZIkL774Yu6+++7svvvurVLfnN736urqHH/88TnxxBNz9913Z5FFFmmV+gBamwASYAEwY8aMLLbYYtloo43qzfdLLFSmI488MnvuuWemTJmS7t27Z6GFFmrtkmhlt9xySzp27JgddtihtUupGB07dsyvfvWrvPHGG1looYWy0kortXZJbcJyyy2Xjh07ZoUVVki3bt2SJCuuuGKTXlsb5rUF1dXVDf7f05pq3+Nu3bplxowZ6dixY5ZbbrkkyQ477JBLLrkkv/nNb3LggQe2ZpkArcYl2AALgOnTpzf5L+pPPPFE9tlnn/Tp0yf9+vXLiSeemIkTJ5bHay/XevPNN+u9bsCAAfUu5542bdocL/uefV3PPvtsBg4cmN69e2fXXXfNX/7yl3rr/uijj3L++ednhx12yIYbbphddtklt99+e4Ptz76dN998M/vuu28GDx6cq666Kl/96lfTt2/fHHHEERk/fny9199///3Za6+9svHGG2eDDTbITjvtlJtvvrk8Xnvp1Nprr50nn3yy3mtvuummrL322vUuS6ut52c/+1m9ZSdPnpwNNtigwSVvX7T9Ofnd736X3XbbLRtttFF69+6db3/72/nzn//c4D1u7LL7Oe2f2S97u/fee7Pbbrtl4403zpZbbpnTTjstkydPLo/XXkq28cYbN7g09Oijj25wqf/UqVNz4YUXZptttskGG2yQXXfdNffee2+91w0YMCAXX3xxzjvvvGy66abZbLPNcsIJJ2TSpElN7n9utx648847y/u07n744IMPsskmmzS6L9dee+2ss8462XTTTXPUUUflww8/LC/T2KV4te9Lc97LJFl22WXTo0ePPProo194u4TZt/WnP/0pm266aYYOHZqk/vE7+7+6df/nP//JkUcemc033zzrr79+tt5665xzzjn5/PPPy8vU1NTkkksuyfbbb5/evXtnl112ye9///smvedJ8tZbb+XYY49Nv3790qdPn+y///7597//XV5/7WWTf/rTn3LYYYelT58+2XbbbXP55Zdn5syZ9fbL7O
/JscceW2+flkqlDBs2LFtvvXX69u2bww47LG+//XZ5+RkzZuSaa67JLrvskt69e2ejjTbKnnvumccff3yu+zFpuM9n/7xUKmXPPfes9/1y9stXk+TWW2/9wks5a2pqct1112WXXXYpz2vsEtXZj+nGan/kkUcaHE8ff/xxzj777Gy99dbZaKONsvvuu+ehhx5qsN4vOn6mTp2ayy+/PDvttFM23HDD7Ljjjrnmmmvq7bd999233uu/8pWv5KCDDsobb7wxz+up7X/VVVfNSiutlKFDh9Y71hoz+/Zn/5mRzNpP++67b26//fZst9122XjjjbP//vvnP//5T3k9jf0s/O9//5v111+/3n558cUXs/fee2fjjTfODjvskFtvvXWO+6ux3pJk4sSJOfPMM7Pddttlgw02SL9+/TJo0KB62579Neeee2423HDDPPzwww3eg+rq6vTo0SO9evXKOuusk8UWWyyrrLJKklnH7fXXX59vfOMb6d27d772ta/lV7/6Vfm+iXWP4X333TejR4/O6NGjs/baa+fRRx/NVlttleOOO67BNnfccccGPw/nxRe9B4MHD87vf//7jB8/vnwM1L0Ee9SoUeUzN/fbb7/ye9XY95HG9u3o0aPz/e9/P3369MnXv/71PProow1q/KKfbausskoWX3zx9OzZM+uss066d+9e7w/Bu+66a6677jq3WADaLWdAAiwAPvvssyy11FJfuNw///nPHHjggdl8881zySWXZPLkyRk2bFj222+/3H777fN0WdDUqVOTJFdeeWWWWWaZJLPCotmDwyQ59NBDs88+++SYY47J7bffnp/85Ce5+uqrs8022+Tzzz/PXnvtlQ8++CBHH310Vl555dx///055ZRT8v777+ewww4rr2ebbbbJEUccUf58+eWXTzLrHlVdunTJz372s8ycOTNDhw7Nvvvumz/96U9ZdNFF89BDD2XQoEHZb7/9ctRRR+Xzzz/PLbfckrPOOisbbLBB+vTpU17n4osvngcffDB9+/Ytz7v33nvToUPDv9ktvvjieeihh1IqlcqXW/31r3/NjBkz6i03L9uv6+abb84555yTo446Kn379s3kyZNz7bXX5vjjj8/GG29cPrMlSYYPH14+06J2fyTJd7/73Xzve98rf37mmWfW28YVV1yRSy+9NHvttVeOOeaYvPHGGxk2bFieeeaZ3HbbbfWOiaqqqjz22GPZZpttkiSffPJJRo4cWe+9KZVKGTRoUJ566qkcffTR6dmzZ/72t7/lmGOOSU1NTb7zne+Ul73lllvSvXv3nH/++Zk4cWKGDh2acePG5dZbb01VVdUX9n/EEUdkzz33TDLrjML11luvfHysttpq+e9//9vgPR06dGg++uijLLnkkvXm1x5b06ZNy5gxY3LhhRfm3HPPzZAhQxrdN42Zl/ey1rRp03Leeec1eRtJ8vnnn+ess87KwQcfnF133bXe2GmnnZb111+//Pn3v//98sfvvvtu9t5772y00Ua54IILUl1dnYcffjjXXXddll9++fK91Y4//viMHDkyhx9+ePr06ZORI0dm8ODB6dSp0xe+5xMnTsyee+6ZRRddNKeeemoWXXTR3HDDDdl7771z++23p2fPnuV6zjjjjGyzzTa57LLL8uSTT2b48OH59NNP89Of/rTRvp944on86U9/qjfv+uuvz9VXX50TTjgha6yxRi644IL8+Mc/zm233ZYkGTJkSH7zm9/kuOOOy9prr5133nknl19+eX784x/noYceyqKLLjpP731dd999d55++um5LjN58uRccsklX7iuUaNG5Z133smOO+7Y7HqSxo+nGTNm5KCDDsprr72Wo48+Oj169Mjvf//7DBo0KDfccEM22WST8rJzO35KpVIOO+ywPPPMMznyyCOzzjrrZNSoUbnkkkvyxhtv5Oyzzy4vu9566+X000/P9OnT8+abb2bo0KE54YQT8pvf/Gae1lPX66+/3uTLgW
u3X+uhhx7KlVdeWW+ZF198MWPHjs2xxx6bpZZaKpdeemn22Wef3HvvveWfLbM799xzM3369PLnn332WX70ox9l5ZV
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 1600x1200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1]\n"
]
}
],
"source": [
"# Build the Ward linkage over the scaled numeric data and draw the full dendrogram.\n",
"linkage_matrix = linkage(df_numerical_scaled, method='ward')\n",
"\n",
"plt.figure(figsize=(16, 12))\n",
"dendrogram(linkage_matrix)\n",
"plt.title('Дендрограмма агломеративной кластеризации (числовой признак \"city_latitude\")')\n",
"plt.xlabel('Индекс образца')\n",
"plt.ylabel('Расстояние')\n",
"plt.show()\n",
"\n",
"# Cut the tree into flat clusters.\n",
"# A distance cut at t=100 put every sample into one cluster (all labels were 1),\n",
"# so ask for a fixed number of clusters instead. 4 mirrors the n_clusters used\n",
"# by the KMeans comparison further down, which keeps the two methods comparable.\n",
"n_hier_clusters = 4\n",
"result = fcluster(linkage_matrix, t=n_hier_clusters, criterion='maxclust')\n",
"print(result)  # flat cluster id (starting at 1) for every sample"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 37,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAABJ8AAAMQCAYAAACJzMTyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeZxVdf0/8NcdZgY3EFIURQHRRA1x+apppblVmuVXs0VTU8ufkksuqWlpufQ1U9wpl8xc0hZz/2b5zSWt3LfS3BIQFQVXQBSYhfv7A2ecgQHmDnOYGXg+Hw8enDn3vO9533vP3Ln3dT/nc0vlcrkcAAAAAChAVVc3AAAAAMCSS/gEAAAAQGGETwAAAAAURvgEAAAAQGGETwAAAAAURvgEAAAAQGGETwAAAAAURvgEAAAAQGGETwAAAAAURvgEQMX23Xff7Lvvvm1edvPNN2f48OE5/vjjF3NXQEeNGjUq1113XVe3QSf5wx/+kIMOOqir2wCAZsInADrNm2++mdNPP72r2wAqcMMNN2Ty5MnZY489uroVOskee+yRN954I3/4wx+6uhUASCJ8AqATnXrqqXn//fez3HLLdXUrQDvMnDkzo0ePzqhRo1JV5WXhkqJUKuXggw/OOeeck5kzZ3Z1OwAgfAKgc/zf//1fbr/99hx66KHp379/q8tmz56dSy+9NJ/5zGcyYsSIfO5zn8vVV1/dapt99903xx9/fC6++OJ84hOfyH/913/lkEMOycSJE1ttd8cdd+TrX/96Ntlkk4wYMSI77bRTrrnmmubLH3zwwQwfPjx///vfs/fee2fkyJH57Gc/m2uvvbZ5m5/85CcZPnx4HnjggeZ1N9xwQ4YPH56bbrqpuZ+5Ty08++yzM3z48Nxwww1JkuHDh+fCCy9stc2FF16Y4cOHz9Pzl770pWy44Yb55Cc/mR//+Md5//33W23zxBNP5Jvf/GY23XTTbLnlljn66KMzefLkVrfpwQcfTJI8//zz2XHHHbPnnnu2+35JkksvvTQ77LBDNthggwwfPrz539y3oaXjjz8+22+/ffPPl19+eTbZZJP89re/bXW/tfWv6X5Kkocffjjf+ta3svnmm2fEiBHZfvvtc+GFF2b27NnN20yfPj2nnXZatt5662y88cbZY4898te//rX58ZjfflreLwcffHA23XTTbLrppjn00EPz8ssvN19/e46NZN7HtVwuZ88998zw4cPzyiuvJElmzZqVk08+OVtttVU+/vGP55hjjsnUqVOba2bOnJmzzz47n/3sZzNixIhsuummOeCAA/LMM8/M975NkldeeaXVfTf3z0373mGHHVodZ//5z3+a79+5758Fuf766zNr1qxst912rdafe+65bd7Xcx8r1113XXbZZZeMGDEi2267bS688MI0NjZWdBuT5B//+Eeb+2v5O3j88cdn3333zR/+8Idst9122WSTTbLffvvl2WefbXX9L774Yr7zne/kk5/8ZDbeeOPsu+++efTRR+fZf8t/TT0OHz48v/71r/O9730vm2yyST7xiU/kf/7nfzJr1qzm+sbGxlx66aX5whe+kJEjR2bjjTfOnnvu2er5pOl5YJNNNkldXV2r/r7zne+0OjW5ZT+33HJLq23vvvvueR7H9uw/SbbbbrvMmjUr119/feZnQb+/LR/r119/PSeccEI+/elPZ+TIkfnyl7+cO++8c77X2/K6m35nmmy//fatTstu79+HuZ+P535eTBb+HABA16nu6gYA6PmmTJmSU045JR/72Mdy4IEH5ve//32ry08++eTccMMNOfjgg7PJJpvk4Ycfzumnn55p06bl0EMPbd7uzjvvTP/+/XPiiSdm9uzZOfvss7Pvvvvmj3/8Y5Zddtn89a9/zaGHHppvfOMbOfzwwzNz5sxce+21OfXUUzNixIhstNFGzdd11FFHZbfddsuoUaNy55135pRTTkmSfP3rX89RRx2Vv/71r/
nRj36UW2+9NW+++Wb+53/+JzvvvHN22223Nm/jSy+9lCuuuKLi++bWW2/NMcccky9+8Ys58sgjM3HixJx77rl54YUX8qtf/SqlUilPP/109tlnn2y00UY588wz09jYmLPPPjvf+ta3msOwls4666yMGDEi3/72t5OkXffLTTfdlLPPPjsHH3xwttpqqyy77LJJkq997Wvtvi2TJ0/OOeeck1NPPTWf/vSnW102ZsyYDBgwIEnyxhtv5LDDDmu+7Nlnn83++++fnXbaKeeee27K5XJuvfXWjBkzJsOGDcsuu+ySxsbGfPOb32wODoYNG5Ybb7wxhx56aK688sr86Ec/yvTp05t7/vKXv5yvfOUrSZJ11lkn48ePz5577plhw4blpz/9aRoaGnLRRRdlr732ys0335yVVlqpuZ8FHRttufnmm/P444/P8xjcdNNNOemkk9K3b9+ccsopOfnkk3PuuecmSY477rg88sgjOfroozN48OBMmDAh559/fr773e/mj3/8Y0qlUrvv97lddtll87yh//a3v53a2tqcdtppWWWVVVJVVZXrrrtuoadd3XLLLdl2221TW1vbav3MmTOz/fbb5+CDD25eN/excskll+Tcc8/NPvvskxNOOCHPPPNMLrzwwrz22msVn347c+bMDBw4MOeff37zuqbHpaVnnnkm48aNy9FHH50VV1wxF1xwQfbZZ5/cdtttWWWVVfLCCy/kq1/9aoYOHZoTTzwxNTU1ueqqq7Lffvvl8ssvzxZbbNHqPtt2222TpNXtP//887PRRhvlvPPOy9ixY3PeeefljTfeyHnnnZckGT16dH7zm9/ku9/9boYPH57JkyfnZz/7WY444oj89a9/bf7dSuaMQLr//vubf1/ee++93HPPPW2OMlt++eVz1113Zdddd21ed9ttt6WqqqpVSNve/ffu3Tvbbbddbr311uy9994LvP9b/v4mrR/rN998M1/+8pfTu3fvHHXUUenfv39uuOGGHHrooTnzzDNb9dsR7f37sDCVPAcAsPgJnwBYZKeffnqmTp2aX/7yl6mubv2nZfz48fn973+fo48+unkC3E996lMplUq55JJL8vWvf715pNSMGTNyww03ZM0110ySDBs2LLvvvntuuumm7LXXXnnhhRey++675wc/+EHz9W+yySb5+Mc/ngcffLBV+PSZz3ymebutt946r7/+en7+859nr732yjLLLJMzzjgjX//613PppZfmscceyworrNDmm92Wt/GjH/1o/v3vfzevq6qqSkNDw3xryuVyRo8ena233jqjR49uXj906NDsv//+ueeee7Ltttvm4osvTr9+/XL55Zend+/eSZJVVlkl3/3ud/Of//yn1XVOmDAhf//733PLLbfkox/9aJK0637517/+lX79+uXoo4+eb78L89vf/jbrrbdevvSlL81z2frrr5811lgjSeYJRp599tl84hOfyFlnndX8pvuTn/xk7rrrrjz44IPZZZddcu+99+af//xnfvazn2XHHXdMkmy55ZZ5+eWX88ADD7QKs5Jk4MCB2XjjjZt//tGPfpRll102V1xxRVZYYYUkyVZbbZUdd9wxl112Wb73ve81b7ugY2PuUOi9997L6NGj87GPfazVY18ul3Pcccc1z5P02GOPNU/YXVdXl/feey8nnnhiPv/5zydJtthii0yfPj1nnHFG3nzzzVZv9Cvx2muv5Re/+EWrft5+++28/PLLOemkk7LTTjs1b/u3v/1tgdc1ffr0PPnkk9l5553nuWzGjBlZffXVW93HLb377rv5+c9/nq997Ws58cQTk8z5ve7Xr19OPPHEHHDAAc3HZ3vMmDEjffv2bbW/psdx7v1efPHF2WyzzZIkI0eOzI477pirrroqxxxzTMaMGZPa2tpcddVVzfXbbrttvvCFL+TMM89sFcYNHjy4zdv3kY98JBdffHGqq6vz6U9/OlVVVfnJT36Sww8/PGuvvXZef/31HH
XUUa1G4vTu3TuHH354nnvuuVbXuc022+TOO+9sDp/uuuuuDBgwoFWY1HLbv/3tb6mrq0ttbW1mzZqVO++8M5tvvnm
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 1200x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Cluster on a single numeric feature: the latitude of the reported city.\n",
"features = df_encoded[['city_latitude']]\n",
"\n",
"# Standardise the feature before computing Ward distances.\n",
"scaled_features = scaler.fit_transform(features)\n",
"\n",
"# Ward linkage over the standardised latitude.\n",
"linkage_matrix = linkage(scaled_features, method='ward')\n",
"\n",
"plt.figure(figsize=(12, 8))\n",
"# Take leaf labels from the very frame that was clustered so they always line up with its rows.\n",
"dendrogram(linkage_matrix, labels=features.index.to_numpy(), leaf_rotation=90, leaf_font_size=10)\n",
"# Title/axis text previously referred to prices and products (left over from another dataset).\n",
"plt.title('Иерархическая кластеризация (дендрограмма) по city_latitude')\n",
"plt.xlabel('Индекс образца')\n",
"plt.ylabel('Евклидово расстояние')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Визуализация распределения кластеров**"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 38,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
2024-12-14 07:16:17 +04:00
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Danil\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\joblib\\externals\\loky\\backend\\context.py:136: UserWarning: Could not find the number of physical cores for the following reason:\n",
"found 0 physical cores < 1\n",
"Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.\n",
" warnings.warn(\n",
" File \"C:\\Users\\Danil\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\joblib\\externals\\loky\\backend\\context.py\", line 282, in _count_physical_cores\n",
" raise ValueError(f\"found {cpu_count_physical} physical cores < 1\")\n"
]
},
2024-12-08 22:51:37 +04:00
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAACbQAAAuoCAYAAAAwk66tAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdfbzX8/0/8MfpdE4XKolVSCJTQtEUhi7McjVjNMwSJrnINbnY1zDMjMxlLie+hrkeNsZcfTFMai5mis3ValSukurUqdP5/eHm/BzVEOdz3h33++22W31e79fn9Xq+3588/9jtcXu/ympra2sDAAAAAAAAAAAAjaxZYxcAAAAAAAAAAAAAiUAbAAAAAAAAAAAABSHQBgAAAAAAAAAAQCEItAEAAAAAAAAAAFAIAm0AAAAAAAAAAAAUgkAbAAAAAAAAAAAAhSDQBgAAAAAAAAAAQCEItAEAAAAAAAAAAFAIAm0AAAAANJja2trGLqHO56mlSPU2pmV9Dk3t+TW1+wEAAABYHgi0AQAAAPCVOOGEE7L11lvXfX7wwQdz/PHHf+X77L333tl7772/0HcmTpyYkSNH1n2eOnVqevTokdtvvz1JMmvWrBx33HGZMGHCV1Jjjx49ctFFF30la5Xap5/V5/VV/97z58/PNddck9122y3f+ta30r9//+y5556544476gXNLrroovTo0eMr2zdJqqurc+aZZ+YPf/jDV7ouAAAAAJ+teWMXAAAAAEDTcMghh2T48OF1n6+55prGK+ZTbrnllrzyyit1nzt27JibbropXbt2TZJMmjQpd955Z3bbbbfGKrEwPv2sPq+v8vd+5513MmLEiLz11lvZe++907t37yxatCgPP/xwTjjhhEyYMCGnn356ysrKvrI9P2nGjBn53//93/zyl79skPUBAAAAWDqBNgAAAAC+Eh+Hw5YHlZWV2WijjRq7DJbi+OOPz7Rp03LTTTelW7dudeODBg3Kaqutll//+tcZPHhwvvOd7zRekQAAAAA0CEeOAgAAAPC51NbW5pprrsn222+f3r1757vf/W6uuuqquuMfP3nk6N57753x48dn/Pjx6dGjR5544olsueWWOeaYYxZbd8iQITnppJOWua733nsvP//5zzN48OBssMEG6d+/f0aNGpWpU6fW1fX73/8+//nPf+qOGf3kkaNPPfVU3Zvlhg8fXnec6dZbb50TTjih3l633357evToUbd2kowfPz577LFH+vTpk2233TZPPPHEYjXOnz8/Z599dgYOHJgNNtggO+20U+65556l3tP8+fPzrW99K7/61a/qjS9cuDCbbbZZzjjjjCTJCy+8kH322Sff+ta3svHGG2fffffNs88++1+f1+OPP57dd989G2+8cfr165eDDz647o1sS3pWyUdHtB533HHZcssts/7662fzzTfPcccdl/fffz/J4r/3U089lSSZOXNmTj755Hz729/OhhtumN133z1PPvnkf61v0qRJ+ctf/pL999+/XpjtY/vuu29+/OMfp3Xr1kv8/uf53ebNm5dTTz01AwYMyAYbbJDtttsuV111Vd29fhyUO/HEE+sdozthwoQMGzYsffr0Sf/+/XP88cfnvffeq7dPr169csstt2SLLbZI//79869//Sv//ve/c9BBB2XTTTdNnz59sscee+SRRx75r88BAAAA4OtKoA0AAACAz+Xss8/O2Wefna233jqXXXZZhg4dmjFjxuSKK65YbO4pp5ySXr16pVevXrnpppvSu3fv7LLLLnnggQcye/bsunkTJ07MG2+8kV133XWZaqqtrc2BBx6Yxx9/PMcee2yuuuqqHHrooXnyySdzyimnJPnoKNSBAwfmG9/4Rm666aYMGjSo3hrrr79+Tj755CTJySefXPe9z+Mf//hHfvKTn6Rt27a58MILM3z48Bx99NGL1Thq1KjceOON2W+//XLppZdm4403zlFHHZU77rhjie
u2aNEi2267bf70pz/VBQaTj8Jo77//fnbeeefMnj07I0aMyEorrZSLLroo5513XqqqqrL//vvnww8/XOK6U6ZMySGHHJINNtggl156aX7xi1/ktddey8iRI7No0aIlPquqqqoMHz48r7zySk455ZRcddVVGT58eO6+++6cd955SRb/vddff/3Mnz8/++yzTx588MEcddRRufjii9O5c+eMGDHiv4baHnvssSSpFyT79LM5+eSTs/nmmy91jc9y5pln5tFHH83xxx+fq666Kt/5zndy9tln57bbbkvHjh1z8cUXJ0kOPvjgur8//fTT2XfffdOyZcucf/75+elPf5rx48dn+PDhmTdvXt3aNTU1GTduXH7xi1/kxBNPzFprrZUDDzwwVVVVOfvss3PJJZekffv2Ofjgg/PGG28s8z0AAAAANFWOHAUAAADgM82aNSvXXntthg0bltGjRydJvv3tb+ftt9/O008/nQMPPLDe/HXWWSdt2rRJkrqjPXfbbbdceeWVue+++7LbbrslSe64445069Ytffv2Xaa6ZsyYkVatWuX444/PJptskiTZdNNN8+9//zs33XRTko+OQu3QoUO9Y0bnzp1bt0abNm2yzjrr1NX98d8/j8svvzwrr7xyLr300lRUVCRJVlpppRx11FF1c5544ok89thjOe+887LDDjskSbbaaqtUVVVlzJgx+d73vpfmzRf/v+l23nnn3HbbbZk4cWLdvd19991Ze+21s+GGG+bZZ5/N+++/n+HDh9c9v7XXXjs33XRT5syZk7Zt2y625vPPP5958+blwAMPTKdOnZIknTt3zoMPPpi5c+cu8VlNmjQpnTt3zq9+9ausscYaSZLNNtsszz33XMaPH1/33D79e998882ZPHlybr755vTp0ydJMmDAgOy9994ZM2ZMbrvttiU+07feeitJ0qVLl8/zEyyT8ePHZ4sttsiOO+6Y5KN/M61bt87KK6+cysrKrLfeekk++rfTq1evJMm5556btdZaK5dffnnKy8uTJH369MmOO+6Y2267LT/+8Y/r1j/ooIPqgpNvv/12Xn311bqwYJL07t07F198caqrqxvsHgEAAACWVwJtAAAAAHymZ599NgsXLsyQIUPqjX+Ro0LXWmutfOtb38qdd96Z3XbbLfPmzcuf/vSnHHDAActcV6dOnXLttdemtrY2U6dOzRtvvJFXX301f/vb30oSFpo4cWIGDx5cF2ZLPjpC9ePAU5I8+eSTKSsry8CBA7Nw4cK68a233jp33XVX/vnPf9YFqD6pf//+WW211XL33Xdnk002yfz58/PAAw9k5MiRSZJvfvOb6dChQw466KBst9122WqrrbLFFlvUBQ6XpE+fPmnRokWGDh2a7bbbLgMGDMimm26a3r17L/U76623Xm644YYsWrQor7/+et54443861//yquvvlrvfj7tySefzDe+8Y2sv/769eYNHjw4Z599dj744IOsuOKKi33v42dXU1Oz1LW/rE033TQ33nhjpk2bloEDB2bgwIEZNWrUUudXVVXlueeey/7775/a2tq6+1ljjTXSvXv3PP744/UCbZ/8PVdZZZWss846+dnPfpa//OUv2XLLLTNgwICceOKJDXZ/AAAAAMszgTYAAAAAPtPMmTOTJB06dPhS6wwdOjQ//elP89Zbb2XixImZM2dOdtllly+15l133ZVf//rXeeutt9K+ffust956admy5Zda8/P64IMPstJKK9Uba968eb2xmTNnpra2dqlvoZsxY8YSA21lZWXZaaedcsstt+Skk07Kww8/nLlz52annXZKkqywwgq5/vrrc+mll+ZPf/pTbrrpprRs2TI777xzTjrppFRWVi62ZpcuXXLdddfliiuuyK233pprr7027dq1y1577ZUjjzwyZWVlS6zx6quvzmWXXZaZM2dmlVVWyQYbbJBWrVot9WjTj+/77bffzvrrr7/E62+//fYSA22rr756kuTNN9
9c6tvypk+fno4dOy613s/yP//zP+ncuXPuuuuunH766Tn99NOz8cYb59RTT03Pnj0Xmz9r1qwsWrQoV155Za688sr
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 2500x3000 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.cluster import KMeans\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# One-hot encode the categorical columns (first level of each dropped).\n",
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
"\n",
"# Cluster on a single numeric feature: the latitude of the reported city.\n",
"features = df_encoded[['city_latitude']]\n",
"\n",
"# Standardise the feature so KMeans distances are in unit-variance space.\n",
"scaler = StandardScaler()\n",
"scaled_features = scaler.fit_transform(features)\n",
"\n",
"# Fit KMeans with a fixed seed: without random_state the centroid initialisation\n",
"# is random, so cluster ids (and plot colours) change on every re-run of the notebook.\n",
"kmeans = KMeans(n_clusters=3, random_state=42)\n",
"df_encoded['Cluster'] = kmeans.fit_predict(scaled_features)\n",
"\n",
"# One tall figure that the four scatter panels below are drawn into.\n",
"plt.figure(figsize=(25, 30))\n",
"\n",
2024-12-14 07:16:17 +04:00
"# Парный график 1: city_latitude vs state\n",
"state_columns = [col for col in df_encoded.columns if col.startswith('state_')]\n",
"if state_columns:\n",
2024-12-08 22:51:37 +04:00
" plt.subplot(4, 1, 1)\n",
2024-12-14 07:16:17 +04:00
" sns.scatterplot(x=df_encoded['city_latitude'], y=df_encoded[state_columns[0]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
" plt.title('city_latitude vs state Clusters')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel(f'state ({state_columns[0]})')\n",
2024-12-08 22:51:37 +04:00
"else:\n",
" plt.subplot(4, 1, 1)\n",
2024-12-14 07:16:17 +04:00
" plt.text(0.5, 0.5, 'No state columns found', ha='center', va='center', fontsize=12)\n",
" plt.title('city_latitude vs state Clusters')\n",
2024-12-08 22:51:37 +04:00
"\n",
2024-12-14 07:16:17 +04:00
"# Парный график 2: city_latitude vs city\n",
2024-12-08 22:51:37 +04:00
"# One-hot columns generated from `city`. The numeric coordinate columns share the\n",
"# 'city_' prefix, so they must be filtered out; otherwise they are picked up as if\n",
"# they were city dummies and the 'city' panels plot coordinates instead of cities.\n",
"coordinate_columns = {'city_latitude', 'city_longitude'}\n",
"city_columns = [col for col in df_encoded.columns if col.startswith('city_') and col not in coordinate_columns]\n",
"if city_columns:\n",
" plt.subplot(4, 1, 2)\n",
" sns.scatterplot(x=df_encoded['city_latitude'], y=df_encoded[city_columns[0]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
2024-12-14 07:16:17 +04:00
" plt.title('city_latitude vs city Clusters')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel(f'city ({city_columns[0]})')\n",
2024-12-08 22:51:37 +04:00
"else:\n",
" plt.subplot(4, 1, 2)\n",
2024-12-14 07:16:17 +04:00
" plt.text(0.5, 0.5, 'No city columns found', ha='center', va='center', fontsize=12)\n",
" plt.title('city_latitude vs city Clusters')\n",
2024-12-08 22:51:37 +04:00
"\n",
2024-12-14 07:16:17 +04:00
"# Парный график 3: city_latitude vs state (другая категория)\n",
"if len(state_columns) > 1:\n",
2024-12-08 22:51:37 +04:00
" plt.subplot(4, 1, 3)\n",
2024-12-14 07:16:17 +04:00
" sns.scatterplot(x=df_encoded['city_latitude'], y=df_encoded[state_columns[1]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
" plt.title('city_latitude vs state Clusters')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel(f'state ({state_columns[1]})')\n",
2024-12-08 22:51:37 +04:00
"else:\n",
" plt.subplot(4, 1, 3)\n",
2024-12-14 07:16:17 +04:00
" plt.text(0.5, 0.5, 'Not enough state columns found', ha='center', va='center', fontsize=12)\n",
" plt.title('city_latitude vs state Clusters')\n",
2024-12-08 22:51:37 +04:00
"\n",
2024-12-14 07:16:17 +04:00
"# Парный график 4: city_latitude vs city (другая подкатегория)\n",
2024-12-08 22:51:37 +04:00
"if len(city_columns) > 1:\n",
" plt.subplot(4, 1, 4)\n",
" sns.scatterplot(x=df_encoded['city_latitude'], y=df_encoded[city_columns[1]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
2024-12-14 07:16:17 +04:00
" plt.title('city_latitude vs city Clusters')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel(f'city ({city_columns[1]})')\n",
2024-12-08 22:51:37 +04:00
"else:\n",
" plt.subplot(4, 1, 4)\n",
2024-12-14 07:16:17 +04:00
" plt.text(0.5, 0.5, 'Not enough city columns found', ha='center', va='center', fontsize=12)\n",
" plt.title('city_latitude vs city Clusters')\n",
2024-12-08 22:51:37 +04:00
"\n",
"# Настройка графиков\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## KMeans (неиерархическая кластеризация) для сравнения"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 39,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Центры кластеров:\n",
" [[48.25734972]\n",
" [32.43819138]\n",
" [37.9835445 ]\n",
" [43.07246158]]\n"
]
},
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hU1dbH8d/0TBpJIIUeSOgdaXoFARFEsWGlWC+IFBUbqNeCqNeuXAFRUexdFEVRbCjXQhURpEOAUJJQ02aSqe8fvJlLSMAkJDMh+X6ex4fk7D3nrL0ykdmss882+P1+vwAAAAAAAAAAAKoRY6gDAAAAAAAAAAAAOBYFDAAAAAAAAAAAUO1QwAAAAAAAAAAAANUOBQwAAAAAAAAAAFDtUMAAAAAAAAAAAADVDgUMAAAAAAAAAABQ7VDAAAAAAAAAAAAA1Q4FDAAAAAAAAAAAUO1QwACAGsLv94c6hJCpTmP/u1iqU6yhVtFckEMAAIDqpTZ/PqtOY2cuUnHkBqi+KGAAqLCrr75aV199dYnjeXl5uuKKK9S+fXt99913gb6tWrXSVVddddzz3XbbbWrVqpXuvvvuKou5qhQWFur111/XpZdeqtNOO009evTQVVddpXnz5hX7IDR9+nS1atWqUq/tcrn073//W/Pnz6+U8x3v51pdtGrVStOnT5dU+WMvsmvXLrVq1UqffPJJuV73wgsv6NVXXw18f+zPe+XKlbrxxhsrJcZPPvlErVq10q5duyrlfMF2bK7Koqp+3gAA4NTDXOR/mIsED3ORI07lucix+c3JydGkSZO0YsWKEEcG4HgoYACoVHl5eRo1apQ2bNigmTNnasCAAYE2o9GoP/74QxkZGSVe53A4tGjRomCGWmn279+vK6+8UrNmzVK/fv303HPP6cknnwxMgO6///4qvZsjKytLb7zxhjweT6Wc78EHH9SDDz5YKeeqCh988IEuv/xySZU/9pP1n//8R06nM/D95Zdfrg8++CDw/UcffaStW7eGIrRq59hclUV1+3kDAIDqhbkIc5Gqxlzk1JeQkKAPPvhAffv2lSStX79en332mXw+X2gDA3Bc5lAHAKDmKJowrF+/XrNmzdI//vGPYu1t27bVli1b9PXXX+u6664r1rZo0SLZ7XZFR0cHMeLKMXnyZGVkZOiDDz5QcnJy4Hjfvn3VoEEDPfvss+rXr5/OPvvs0AVZDqmpqaEO4YQ6d+4c6hDKLCkpSUlJSaEOAwAAoMZjLsJcJBiYi5z6rFbrKfVzBMAKDACVJD8/X6NHj9bGjRv18ssvl5gwSFJ4eLjOOussff311yXaFixYoEGDBslsLl5X9fl8evnll3XOOeeoffv2GjRokN56661ifbxer15++WUNGTJEHTt2VOfOnXXVVVdpyZIlgT7Tp0/XOeecox9//FEXXHBB4Fzz5s0rdq433nhD5557rjp06KDevXtrypQpysvLO+64169fr59//ln//Oc/i00Yilx33XUaMWKEwsPDS319//79SyxTP3Y5bkFBgaZMmaI+ffqoffv2OvfccwNLg3ft2hWYjNxzzz3q379/4DwrVqzQyJEj1alTJ/Xo0UOTJ0/WwYMHi12nbdu2+uijj/SPf/xDPXr00JYtW0os227VqpXeeecd/etf/1KPHj3UpUsX3Xrrrdq/f3+xuF999VWdffbZ6tixo6666ir98MMPatWqlZYuXRqI9egl18eTlZWlyZMn6/TTT1eXLl00cuRIrVq1qlg806dPL3XsP/74o1q1aqWff/652DlXrFihVq1aaeXKlSe89oksX75c//znP9W9e3e1b99e/fv31/Tp0wN36hQtz54xY0bg66OXbd9999369NNPtXv37sCS5aVLlxbLUZFjfwY+n08vvPCC+vbtq06dOmncuHHKzs4uEeOmTZs0ZswYde3aVV27dtX48eOVnp5+3DHNnz9frVq10qZNm4od/+6779SqVS
utW7dOUvl/L3w+n5577jn1798/kKtnnnlGbrf7uLkquu7w4cPVpUuXwHv9nXfekXRy73UAAFCzMRdhLsJchLnI0bZt26YJEyaoR48e6t69u8aMGRNYfXL0I6SWLl2qa665RpJ0zTXX6Oqrr9Y777yjVq1aKS0trdg5P/vsM7Vp00Z79+494bUBVD4KGABOmsPh0I033qh169Zp9uzZ6tmz53H7nnfeeSWWbufl5Wnx4sUaMmRIif5TpkzR888/rwsvvFAvvviizj33XP373//WzJkzA32efvppvfDCC7ryyiv1yiuv6OGHH9bhw4d16623FltCu2/fPk2dOlXXXHONXn75ZTVq1EiTJ08OfJD54osv9NRTT2nEiBF69dVXNX78eH322Wd6+OGHjzue//73v5JU7MP60Ww2mx544AGdfvrpxz3H3/n3v/+txYsXa/LkyYEP5k8++aTmzp2rhIQEzZgxQ5I0duzYwNfLly/Xddddp7CwME2bNk333nuvli1bpmuuuUYFBQWBc3u9Xs2ZM0ePPvqo7rnnHqWkpJQaw3PPPSefz6dnn31WkyZN0qJFi/Tvf/870D5jxgw9/fTTGjx4sF544QV16tRJEydOLHaOoqW6RUuuS5Ofn69hw4Zp6dKluuuuuzRjxgzZbDbdcMMN2r59e4nzHTv23r17KyEhQZ999lmxvvPmzVNycrJOO+20Eyf7ODZs2KDrrrtOMTExeu655zRr1ix169ZNM2bM0FdffSVJgeXZl112WbGl2kXGjRuns846S/Hx8cWWLJfFU089pZkzZ+qyyy7TjBkzFBMTo2eeeaZYn7S0NF111VU6cOCAnnjiCT366KNKT0/XsGHDdODAgVLPO2DAAIWHh+vLL78sdvyLL75QixYt1LZt2wr9XsyePVvvvfeexo8frzlz5mjYsGF69dVXNWvWrOPm6scff9T48ePVrl07vfDCC5o+fboaN26sqVOnavXq1Sf9XgcAADUTcxHmIsxFmIscLTMzU1deeaW2b9+uKVOm6KmnntL+/ft17bXX6vDhw8X6tmvXTg888IAk6YEHHtCDDz6oCy64QDabrdSf4+mnn6769euXNXUAKgmPkAJwUoomDEV3kzgcjhP279u3r+x2e7Gl299++63q1q1b4gNdWlqaPvzwQ91+++2BzcbOPPNMGQwGvfTSSxo+fLhiY2OVlZWl2267rdhdIjabTTfffLM2btwYWB7qdDr16KOPBj7AJycnq1+/fvrpp5+UkpKiZcuWqVGjRhoxYoSMRqN69Oih8PDwUu8uKVJ090WjRo3KnrRyWrZsmf7xj3/o/PPPlyT17NlT4eHhqlu3rqxWq9q0aSNJatKkidq2bStJeuaZZ9SsWTO99NJLMplMkqROnTrp/PPP19y5czVixIjA+W+66aa//QDbsmVLPfbYY4Hv//zzz8Ddaw6HQ7Nnz9aIESN05513Sjryc3I6ncU+PJdlqW7RXUGffvppYFxdu3bVxRdfrOXLlxe7s+x4Y7/kkkv01ltvKT8/XxERESooKNBXX311UhvWbdiwQWeccYaeeuopGY1Hav//+Mc/9MMPP2jp0qU6//zzA2NLSkoqdZxNmjRRXFxcuZcs5+Tk6K233tL111+vCRMmSJJ69+6trKyswKRVOjJxs9vtev311xUZGSlJOv300zVgwAC98sormjx5colz2+12DRo0SAsWLNBtt90m6cjEbdGiRRo/frwkVej3YtmyZWrfvr0uvfRSSVKPHj1kt9sVFRUlSaXmasuWLbrkkkv0r3/9K3CeLl26qGfPnlq6dKk6dep00u91AABQszAXYS7CXIS5yLFef/11uVwuvfbaa4qPj5cktW7dWsOGDdPq1auLFcoiIyMDjy1LTU0NfH3OOefo888/16233iqDwaCMjAwtWbJETz31VJlzB6DysAIDwElZu3atNm/erHfeeUdNmzbV3XffrX379h23f1hYmPr3719s6faXX36pwY
MHy2AwFOu7ZMkS+f1+9e/fXx6PJ/Bf//79VVhYGJioPPPMM7r22mt18OBBrVixQnPnztXnn38uSXK5XMXOefSHtaL
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 1600x1200 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.cluster import KMeans\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Загрузка данных\n",
2024-12-14 07:16:17 +04:00
"df = pd.read_csv(\"../../datasets/nuforc_reports.csv\")\n",
2024-12-08 22:51:37 +04:00
"df = df.iloc[:1000].dropna()\n",
"\n",
"# Удаление несущественных столбцов\n",
"columns_to_drop = ['summary', 'stats', 'report_link', 'posted', \"duration\"]\n",
"df_cleaned = df.drop(columns=columns_to_drop)\n",
"\n",
"# Закодирование категориальных переменных\n",
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
"\n",
"# Выбор подмножества данных для кластеризации\n",
"features_used = ['city_latitude']\n",
"data_to_scale = df_encoded[features_used]\n",
"\n",
"# Масштабирование данных\n",
"scaler = StandardScaler()\n",
"data_scaled = scaler.fit_transform(data_to_scale)\n",
"\n",
"# Кластеризация данных\n",
"random_state = 42\n",
"kmeans = KMeans(n_clusters=4, random_state=random_state)\n",
"labels = kmeans.fit_predict(data_scaled)\n",
"centers = kmeans.cluster_centers_\n",
"\n",
"# Отображение центроидов\n",
"centers_original = scaler.inverse_transform(centers) # Обратная стандартизация\n",
"print(\"Центры кластеров:\\n\", centers_original)\n",
"\n",
"# Визуализация результатов кластеризации KMeans\n",
"plt.figure(figsize=(16, 12))\n",
"\n",
2024-12-14 07:16:17 +04:00
"# Парный график 1: city_latitude vs state\n",
2024-12-08 22:51:37 +04:00
"plt.subplot(2, 2, 1)\n",
2024-12-14 07:16:17 +04:00
"state_columns = [col for col in df_encoded.columns if col.startswith('state_')]\n",
"if state_columns:\n",
" sns.scatterplot(x=df_cleaned['city_latitude'], y=df_encoded[state_columns[0]], hue=labels, palette='Set1', alpha=0.6)\n",
" plt.title('KMeans Clustering: city_latitude vs state')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel(f'state ({state_columns[0]})')\n",
2024-12-08 22:51:37 +04:00
"else:\n",
2024-12-14 07:16:17 +04:00
" plt.title('KMeans Clustering: city_latitude vs state (No Data)')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
" plt.ylabel('state')\n",
"\n",
2024-12-14 07:16:17 +04:00
"# Парный график 2: city_latitude vs city\n",
2024-12-08 22:51:37 +04:00
"plt.subplot(2, 2, 2)\n",
"# One-hot columns generated from `city`. The numeric coordinate columns share the\n",
"# 'city_' prefix, so they must be filtered out; otherwise they are picked up as if\n",
"# they were city dummies and the 'city' panels plot coordinates instead of cities.\n",
"coordinate_columns = {'city_latitude', 'city_longitude'}\n",
"city_columns = [col for col in df_encoded.columns if col.startswith('city_') and col not in coordinate_columns]\n",
"if city_columns:\n",
" sns.scatterplot(x=df_cleaned['city_latitude'], y=df_encoded[city_columns[0]], hue=labels, palette='Set1', alpha=0.6)\n",
2024-12-14 07:16:17 +04:00
" plt.title('KMeans Clustering: city_latitude vs city')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel(f'city ({city_columns[0]})')\n",
2024-12-08 22:51:37 +04:00
"else:\n",
2024-12-14 07:16:17 +04:00
" plt.title('KMeans Clustering: city_latitude vs city (No Data)')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel('city')\n",
2024-12-08 22:51:37 +04:00
"\n",
2024-12-14 07:16:17 +04:00
"# Парный график 3: city_latitude vs state (другая категория)\n",
2024-12-08 22:51:37 +04:00
"plt.subplot(2, 2, 3)\n",
2024-12-14 07:16:17 +04:00
"if len(state_columns) > 1:\n",
" sns.scatterplot(x=df_cleaned['city_latitude'], y=df_encoded[state_columns[1]], hue=labels, palette='Set1', alpha=0.6)\n",
" plt.title('KMeans Clustering: city_latitude vs state')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel(f'state ({state_columns[1]})')\n",
2024-12-08 22:51:37 +04:00
"else:\n",
2024-12-14 07:16:17 +04:00
" plt.title('KMeans Clustering: city_latitude vs state (No Data)')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
" plt.ylabel('state')\n",
"\n",
2024-12-14 07:16:17 +04:00
"# Парный график 4: city_latitude vs city (другая подкатегория)\n",
2024-12-08 22:51:37 +04:00
"plt.subplot(2, 2, 4)\n",
"if len(city_columns) > 1:\n",
" sns.scatterplot(x=df_cleaned['city_latitude'], y=df_encoded[city_columns[1]], hue=labels, palette='Set1', alpha=0.6)\n",
2024-12-14 07:16:17 +04:00
" plt.title('KMeans Clustering: city_latitude vs city')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel(f'city ({city_columns[1]})')\n",
2024-12-08 22:51:37 +04:00
"else:\n",
2024-12-14 07:16:17 +04:00
" plt.title('KMeans Clustering: city_latitude vs city (No Data)')\n",
2024-12-08 22:51:37 +04:00
" plt.xlabel('city_latitude')\n",
2024-12-14 07:16:17 +04:00
" plt.ylabel('city')\n",
2024-12-08 22:51:37 +04:00
"\n",
"# Настройка графиков\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### PCA для визуализации сокращенной размерности"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 40,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAAJICAYAAADPWa1BAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3zU9f0H8Nd33ExCEgKZEAIEwt57KKCiIoqziohIHSi2tlpbtSpi/TnauurAaqvVKhZUFBRRBEVBRfbeI0AYGYyQcfM7fn8cObjkLmTcTF7Px8NHy33u7vu+T+6S7+s+38/nI+i6roOIiIiIiIiIiIiIiCiKiJEugIiIiIiIiIiIiIiIqDoOYBARERERERERERERUdThAAYREREREREREREREUUdDmAQEREREREREREREVHU4QAGERERERERERERERFFHQ5gEBERERERERERERFR1OEABhERERERERERERERRR0OYBARERERERERERERUdThAAYRUTOi63qkS6AAmvPPpjm/diIiIiKqieeHzQN/zkRUFxzAICJMnjwZeXl5Pv/16NEDo0aNwpNPPonTp0/XeEx+fj5mzpyJiy++GL169cKoUaPwwAMPYOfOnQGP89JLLyEvLw9PPfVUKF9OQK+++iry8vIicmx/Pv30U+Tl5eHw4cMhf5zL5cIzzzyDL774or5l1stNN92EvLw8LF68OKTHibafZWOUlZXhT3/6E9auXeu9bfLkyZg8eXLYaqjr53nMmDF4+OGHg3rsPXv2YOLEiUF5rsOHDyMvLw+ffvppUJ6PiIiIogtzS2Q0pdySl5eHV199tcbtu3fvxtChQ3HhhRfiwIED3vvm5eXhxRdf9PtcmqZh5MiRMXv+WVRUhL/97W+47LLL0Lt3b4wYMQJ33323Ty4BQpNNCgsLcdddd+HIkSNBeb5AP1ciaho4gEFEAIBu3bph7ty53v/+85//4LbbbsO8efMwbdo0nysjvvnmG1xzzTXYtm0b7rnnHvzrX//C/fffjwMHDuBXv/oVfvrppxrPr2ka5s+fj86dO2PBggWw2+3hfHnNXnFxMd577z0oihKyY+zfvx8bNmxA586dMWfOnJAdp6nZsWMHFixYAE3TvLc98cQTeOKJJ8Jy/IZ8noPp66+/xoYNG4LyXKmpqZg7dy5GjRoVlOcjIiKi6MPc0rSFI7dUt2fPHtx2222wWCz44IMPkJOT420TRRFff/2138etWbMGxcXFYaoyuNatW4cJEyZg2bJluPXWW/HPf/4Tjz76KBwOByZPnoz58+eH9Pg///wzfvjhh6A939y5c3HDDTcE7fmIKLrIkS6AiKJDfHw8+vTp43PbwIEDUVlZiVdeeQWbNm1Cnz59cOjQITz00EMYOXIkXn75ZUiS5L3/2LFjMXHiRDz00EP47rvvYDQavW0//vgjCgsL8eKLL+KWW27BwoULeYLRxHz66afIysrCtGnT8OCDD+LgwYNo165dpMuKSbm5uWE5TkM/z9HKaDTW+D1GRERETQtzCwXTvn37MGXKFMTFxeG9995DZmamT3u/fv2wdu1abN++Hd26dfNp+/LLL9G1a1fs2LEjnCU3WmlpKX7/+98jJycH//nPf2CxWLxtl156Ke666y7MmDEDI0aMQKtWrSJYad0xAxA1bZyBQUS16tGjBwDg6NGjAID3338fLpcLjz32mE8IAACLxYKHHnoI1113XY3p2/PmzUPnzp3Rv39/DB48GHPnzj3vsceMGYNnnnkGU6ZMQa9evfDoo48C8JxwzZgxA8OGDUPPnj3xq1/9CitXrvR5rNPpxLPPPovhw4ejb9++eOSRR+B0On3u428q7KpVq5CXl4dVq1Z5b9u/fz9+85vfYNCgQRg4cCCmTZuGffv2+Rzrb3/7Gy688EL06NEDV155JRYtWuTzvJqmYdasWRg1ahR69+6N6dOn+53iXl1dH7d06VLcfPPN6Nu3L3
r06IHLLrsMs2fPBuBZVueiiy4CADzyyCMYM2aM93Eff/wxrr32WvTp0we9evXChAkT8NVXX/k8d15e3nmXDVJVFfPnz8fo0aNx8cUXw2q1+v0Zu91uPP/887jgggvQq1cv3H777Zg/f36NqeWfffYZxo0bh549e+Kqq67CypUr0a1bt1qnZi9atAjXXnst+vbti+HDh2PGjBk+ffXqq6/isssuw5IlSzB+/Hj07NkTEyZMwIYNG7Bx40bccMMN6NWrF8aPH1/j/bR7925MmzYN/fr1Q79+/XDvvfeioKDA2171vpkzZw5Gjx6Nfv36ea/oq62PV61ahVtvvRUAcOutt3rfj+e+N3/961/j2muvrfF6p0+fjquuusr777Vr1+KWW25B7969MWjQIDz00EM4efJkwP4CGv55Pvc1n/tZqV47AGzduhVTpkxB//790bdvX9x2223YuHEjAM/P5LXXXgPgO+1b0zS89dZbuOSSS9CjRw9ceumleP/992sc58EHH8R9992HPn36YOrUqTWWkPr000/RrVs3bNq0CTfeeCN69uyJ0aNH4+233/Z5ruLiYtx///3ez/iMGTPw0ksv+XxWiIiIKLoxtzC31CW3nGvfvn249dZbkZCQgA8++KDG4AXgGRxr1apVjVkYiqLgm2++wRVXXFHjMXX5uZ88eRJPPvkkRo8ejR49emDQoEG49957fTLR5MmT8eijj+Ktt97CqFGj0LNnT9x0003YvHmz9z4OhwMzZ87EBRdc4O3P6ue61c2fPx/FxcX485//7DN4AXhmnDz44IOYNGkSKioqajw20JKtDz/8sM/P69ChQ7j77rsxePBg9O7dGzfeeKN3xsWnn36KRx55BABw0UUX+fzMPv74Y1xxxRXepeFeffVVqKrqc5wpU6bgiSeeQL9+/TBu3DioquqTJao+GytXrsSvf/1r9O7dG8OHD8ff//53n+eqqKjAjBkzMHToUPTt2xf3338/3n333ahavo2IPDiAQUS1ys/PBwC0bdsWALBixQp069YNaWlpfu8/dOhQ3H///WjdurX3ttLSUnz33Xe4+uqrAQDXXHMNtmzZgm3btp33+LNnz0bPnj0xa9YsXH/99XA6nZgyZQq+/fZb3H///XjttdeQnp6OO+64w+ek8I9//CM++ugjTJs2DS+//DJOnz6Nd999t96vv6ioCDfeeCMOHDiAmTNn4u9//zuOHz+OKVOmoLS0FLqu495778WcOXMwdepUvPHGG96Tn3On3f7973/H66+/juuvvx6vvfYakpKS8MILL5z3+HV53Pfff497770X3bt3x6xZs/Dqq6+ibdu2+Mtf/oJNmzYhNTXV+yXxPffc4/3/s2fPxowZM3DxxRfjzTffxPPPPw+j0YgHH3wQhYWF3uefO3cupk+fXmudy5cvR0lJCa6++mqYzWZcfvnl+Oyzz+ByuXzuN2PGDLz33nu45ZZb8Prrr6NVq1Z4/PHHfe4zf/58PPzww+jXrx9mzZqFSy+9FNOnT/c52axu1qxZeOCBB9CnTx+88soruPfee7F48WJMnjwZDofDe7/CwkI899xzuPvuu/GPf/wDZWVluO+++/DAAw/ghhtuwOuvvw5d13H//fd7H5efn4+bbroJJ06cwF//+lc8/fTTKCgowMSJE3HixAmfOl577TU89NBDmDFjBvr27XvePu7evTtmzJjh7Rt/y0ZdddVV2LZtGw4ePOi9raysDMuXL8eECRMAeKav33bbbTCbzXj55Zfx5z//GatXr8att97q8/qra8jnuT4qKipwxx13IDk5Ga+++ipeeukl2O123H777SgvL8cNN9yA66+/HoDvtO+ZM2filVdewVVXXYV//vOfuOyyy/DMM8/g9ddf93n+r776CnFxcXjjjTdwxx13+K1B0zT8/ve/x7hx4/DWW2+hX79++Nvf/oYVK1YA8KyzPGXKFKxfvx5//vOf8eyzz2Lnzp145513GvSaiYiIKD
KYW5hb6pJbquzfvx9TpkxBfHw8Pvjgg4DvE0mScOmll9YYwFi5ciWcTmeNC17q8nPXdR3Tpk3DTz/9hAcffBBvv/0
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 1600x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Project the scaled numeric data onto a single principal component.\n",
"pca = PCA(n_components=1)\n",
"reduced_data = pca.fit_transform(data_scaled)\n",
"\n",
"# The same projection is drawn twice, coloured once per clustering method.\n",
"sample_index = range(len(reduced_data))\n",
"first_component = reduced_data[:, 0]\n",
"panels = [\n",
"    (result, 'PCA reduced data: Agglomerative Clustering'),\n",
"    (labels, 'PCA reduced data: KMeans Clustering'),\n",
"]\n",
"\n",
"fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
"for ax, (cluster_ids, panel_title) in zip(axes, panels):\n",
"    sns.scatterplot(x=sample_index, y=first_component, hue=cluster_ids, palette='Set1', alpha=0.6, ax=ax)\n",
"    ax.set_title(panel_title)\n",
"    ax.set_xlabel('Sample Index')\n",
"    ax.set_ylabel('Principal Component 1')\n",
"\n",
"fig.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Анализ инерции для метода локтя (метод оценки суммы квадратов расстояний)"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 41,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1YAAAImCAYAAABQCRseAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB5nElEQVR4nO3dd3hUVf7H8c/MZNILpIdeQqgpIAFxQRH5sRZ0RSyrYkFQFnFZRUVR1hV3LYsUKaIiiBVBBVGXVddeEBCUnkAIEKSF9EZ6Mr8/QkaGUEIS5iaT9+t58iRz77l3vndy1s2Hc+65JpvNZhMAAAAAoM7MRhcAAAAAAE0dwQoAAAAA6olgBQAAAAD1RLACAAAAgHoiWAEAAABAPRGsAAAAAKCeCFYAAAAAUE8EKwAAAACoJ4IVAAAAANQTwQoAAAAA6olgBQBncdttt6lr167685//fNo2DzzwgLp27apHH33UiZUBqKuDBw+qa9euWrlypdGlAHARBCsAqAWz2azNmzcrNTW1xr7CwkJ98803BlQFAAAaC4IVANRCjx495OHhoc8++6zGvm+++UZeXl4KCwszoDIAANAYEKwAoBa8vb11ySWXnDJY/fe//9Uf//hHubm51dj35Zdf6rrrrlN0dLT+8Ic/6F//+pcKCwslSUOGDFHXrl1P+XXw4EFJ0po1a3TLLbfoggsuUP/+/fXggw/qyJEjDu/x4IMPnvIcZ5viVD3F8VRfJ9q2bZvGjBmj/v37q0+fPvrLX/6i3bt32/evX79eXbt21fr16yVJSUlJGjp0qP785z9r3rx5p32PefPmSZLef/99XXHFFerVq5fD/rNNq3zvvfdOed4Tj6ue7nW2dnWtobafzZne/3T7q38Pjz76qIYMGeLwvsuWLXP4DE98n19++cWh7dtvv62uXbs6nKO4uFgzZ87UsGHD1KtXL/Xp00ejR49WYmKiw7Gnq+u2225zaFNdx6mc3D+q3XbbbQ7nKSkp0YsvvqjLL79c0dHRGjZsmBYuXKjKykqHY06uZf369bU69mxsNpumTJmimJgY/fjjj7U+DgCq1fwrAABwSldeeaXuv/9+paamKjw8XJJUUFCg77//XkuWLNH333/v0P6TTz7RQw89pKuvvlr333+/Dh06pNmzZys5OVlLlizR/PnzVVpaqvT0dN13330aP368Bg8eLEkKDQ3VqlWr9Mgjj2j48OEaN26csrOzNXfuXN1000368MMPFRQUJKnqD9KbbrpJ1113nSTZz1cbPXr00D/+8Q/76/fff18ffPCB/fW6des0duxY9e/fX88884xKSkr0yiuv6M9//rPee+89de7cucY5n3/+efXq1Uvjx49XQECABg0aJEmaNm2aJNnfLzw8XBs2bNDUqVN1/fXXa+rUqfLx8ZGkWtVfXFys6OhoTZ061b7tdMed+Nme3K6uNZzLZ/PEE0+oZ8+ep3z/5cuXS5J27Nihp556qkbbk+Xm5uqFF1445T4fHx99/fXXuuCCC+zb/vvf/8psdvx31MmTJ2vjxo2aNGmS2rVrp/3792vOnDl68MEHtXr1aplMJnvb66+/XjfccIP9dfXvsSHZbDb95S9/0ebNm3XfffepW7duWr9+vV544QUdOHBA//znP+1tT+6znTt3rvWxZ/Kvf/1L//nPf/Tiiy9q4MCBDX6NAFwfwQoAamnw4MHy8vLSZ599pjvvvFOS9MUXXygoKMjhD1mp6g/FGTNmaNCgQZoxY4Z9e4cOHXTnnXfqu+++s/+hXz061a5dO8XFxUmSKisrNWPGDA0cOFAzZ860H9+nTx9deeWVWrx4sSZPnixJKioqUocOHezHVp+vNnx9fe3HSdIPP/zgsH/mzJlq3769Fi5cKIvFIkkaOHCg/u///k9z587VnDlzHNrv379fP/74oz7++GN16dJFkuwh1NfXV5Ic3m/16tWSpMcee8weaCTJ3d39rLUXFRUpODjY4X
ynO+7Ez/bkdlu3bq1TDefy2URGRp72/au3l5SUnLLtyebOnatWrVopOzu7xr6LL75YX331lR5++GFJUmpqqjZt2qS+ffvq0KFDkqTS0lIdO3ZMU6dO1ZVXXilJ6tevnwoKCvTcc88pIyNDISEh9nOGh4c71FP9e2xI33//vX766SfNmjVLV111lSTpD3/4gzw9PTVnzhzdfvvt9v50cp/97rvvan3s6cycOVPLly/X/PnzdfHFFzf49QFoHpgKCAC15OnpqSFDhjhMB1y9erWuuOIKh3/hl6S9e/cqNTVVQ4YMUXl5uf0rPj5evr6+WrNmzRnfa9++fUpPT9fw4cMdtrdr1069e/fWzz//bN925MgR+fn5NcAVOiosLNS2bdt0xRVX2IODJPn7++vSSy91qKG6/ezZs9W/f/+z/iFbLSYmRpL02muvKS0tTaWlpSovL6/VsQ113XWp4Vw/m4aSlJSk5cuX6+9///sp9w8ZMkQpKSnau3evJOmzzz5TbGysWrdubW/j7u6uxYsX68orr9TRo0e1bt06LVu2zL4AS2lp6TnXVVlZqfLyctlstrO2qf46se3PP/8sNzc3XX755Q7HXHPNNfb9p1OfYyXpnXfe0cKFC3XVVVc5jGoCwLlixAoAzsEVV1yh++67T6mpqfLw8NDatWt1//3312iXk5MjqWra1KmmTqWlpZ3xfaqPDw4OrrEvODhYCQkJkqpGxg4fPqw2bdqc24XUQn5+vmw222lryM/Pd9j2l7/8Rf7+/g5TCc8mPj5eU6dO1cKFCzV//vxzqu/QoUNnnDJ3Pms418+mofzrX//SVVddpd69e59yf1hYmHr16qWvvvpKnTp10n//+18NHz7c3l+q/fDDD3rmmWe0d+9e+fj4qFu3bvL29pakM4aj01mwYIEWLFggi8Wi4OBgDRw4UH/7298cFnSpHuU9Ub9+/SRVTW9s2bKlQ0iVZB85O9PnWZ9jJWnnzp0aOHCg/vOf/+iOO+5Qjx49ztgeAE6HYAUA5+Diiy+Wj4+PPvvsM3l7e6tNmzbq1atXjXb+/v6Squ5lqf7j8UQBAQFnfJ8WLVpIkjIyMmrsS09PV8uWLSVJiYmJKi4urrHgREPw8/OTyWQ6bQ3VNVabPHmyPvvsM02cOFHvvPNOraeM3Xjjjfrxxx9VXl6uJ554Qm3atNH48ePPeExlZaW2bNmikSNH1uo9Th5RrG8N5/rZNIRPP/1U27dvd5gaeiqXXXaZvvrqK11xxRXavn275s+f7xCsfvvtN02YMEFDhw7VK6+8orZt28pkMumdd96pMRVUOvtnJ1V9fjfeeKMqKyt1+PBhzZ49W3fffbc+/vhje5tp06Y5BOET75MKCAhQdna2KioqHAJS9T9AVPf3U6nPsZL0t7/9TbfffruuuuoqTZ06Ve+//36NkAYAtcFUQAA4B+7u7ho6dKg+//xzffrpp/Z7Ok7WqVMnBQUF6eDBg4qOjrZ/hYWFaebMmTVGEE7WsWNHhYSE6D//+Y/D9gMHDmjz5s3q06ePJOnbb79V9+7dFRgYeM7XUllZecY/IL29vdWrVy99+umnqqiosG/Pz8/Xt99+W+O+sl69emn+/Pk6dOiQnn/++VrXMWfOHH377bd67rnndMUVVyg6Ovqs9zf9+uuvKiwsVP/+/c/Yrnr05eTFG+pbw7l+NvVVWlqq6dOna8KECQ73P53K0KFDtWXLFr399tu64IILFBoa6rB/+/btKikp0T333KN27drZg1N1qKr+zKpX1DvbZydVLbYSHR2t2NhYXXHFFbr11lu1a9cu5ebm2tt07NjR4X8LJ97P1q9fP5WXl9dYdbM6mJ3p86zPsVLVCKOnp6eeeOIJ7dixQ0uWLDnr9QLAqTBiBQDn6Morr9S4ceNkNpsdVqQ7kcVi0QMPPKAnnnhCFotFl156qfLy8rRgwQIdPXr0rFPYzGazJk2apClTpujBBx/UNddco+zsbM2fP18BAQEaPX
q0duzYoXfeeUdXXXWVNm/ebD82PT1dUtXIRFZWVo3QlZWVpeTkZO3fv98e0E7nwQcf1JgxY3TPPffolltuUVlZmRY
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Список для хранения инерций\n",
"inertias = []\n",
"clusters_range = range(1, 11)\n",
"\n",
"# Вычисление инерции для каждого количества кластеров\n",
"for i in clusters_range:\n",
" kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
" kmeans.fit(data_scaled)\n",
" inertias.append(kmeans.inertia_)\n",
"\n",
"# Визуализация метода локтя\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(clusters_range, inertias, marker='o')\n",
"plt.title('Метод локтя для оптимального k')\n",
"plt.xlabel('Количество кластеров')\n",
"plt.ylabel('Инерция')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Расчет коэффициентов силуэта"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 42,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1oAAAImCAYAAABKNfuQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAADAqElEQVR4nOzdd3iT5foH8G+SJt170xbaUronljJbprgX4kABPR7A40I9R1DUnxNFZXkAQVRQUTlyFERQ9KCI7FGghdI9KNC9d9OR5PdH2khpgaYkfZPm+7muXsL7vnlz57EtufM8z32LVCqVCkRERERERKQzYqEDICIiIiIiGmiYaBEREREREekYEy0iIiIiIiIdY6JFRERERESkY0y0iIiIiIiIdIyJFhERERERkY4x0SIiIiIiItIxJlpEREREREQ6xkSLiIiIiIhIx5hoEZHJmDVrFmbNmtXl2IkTJ3DnnXciNDQU3333nV6f/6WXXsKkSZO0ftykSZPw0ksv6SEiItKXoKAgrF69WugwiEhAZkIHQEQklMrKSvzjH/9AWFgYNmzYgKCgIKFDIiIiogGCiRYRmazPP/8ccrkcH3zwAdzd3YUOh4iIiAYQLh0kIpNUXV2NzZs344477uiWZOXn52P+/PkYO3YsoqOjMWvWLJw8ebLLNX/++SemTZuGqKgojBkzBq+//jrq6+u7XPPNN99g4sSJiIqKwvPPP4+GhgYAwLp16zB69GjExsbi9ddfR2trq+Yxra2tePPNNzFixAiMHDlSs/SosbERCxYsQHR0NMaPH49vvvlG85iCggIEBQVh27ZtmmMtLS2YPHlyl1m6npZOHjt2DEFBQTh27FiPfwfUM3+xsbHdlj1+9913uO222xAeHo4JEyZg9erVUCgUmvM9LZW8NNbO5+rpqzPOay2b7Ok1Xa6srAwvvvgiRo8ejZiYGMycORNJSUma85cv8VKpVHjwwQcRFBSEgoKCLtddLdb58+cjISEBSqWyy/O/8soruOmmmwAAJSUl+Oc//4lRo0YhKioKs2bNQnJyMgBg9erVV3yOzvgyMjLw9NNPY9SoUQgLC0N8fDwWL14MuVx+1TE4dOjQVWPv7WsEgD179uCee+5BVFTUVe91qW3btiEoKAinT5/GPffcg8jISNxxxx349ddfu1xXUFCAhQsXYty4cQgLC8Po0aOxcOFCVFdXa65JT0/Hww8/jJiYGEyZMgXffvut5lxP379A9++Tay3ru/T7btOmTd1+vo4ePYrg4GB89NFHV7zH5VatWoWQkBD88MMPvX4MERk3zmgRkUlRqVQoLi7G4sWL0d7ejscff7zL+ZycHNx///3w9fXFq6++CqlUik2bNuGRRx7Bxo0bERcXh8TERDzxxBO488478a9//QvZ2dn48MMPkZWVha+//hoSiQS///473nrrLcyaNQsJCQnYsmULfv/9dwDArl27sHjxYhQWFmLZsmWwsLDAokWLAABLly7F1q1bsXDhQnh4eGDlypUoLCxEYWEhbr75ZqxatQr79+/HW2+9BQ8PD0yePLnH1/nZZ591SRKux/Lly1FfXw87OzvNsfXr12PlypWYOXMmFi1ahPT0dKxevRrFxcV49913e3XfsLAwbNmyBYA6afv+++81f7exsdFJ7I2NjZgxYwYUCgUWLFgAd3d3bNy4EY899hh++OEH+Pr6dnvMjz/+2CURu9T06dNx3333af7+5ptvdjn3v//9D8eOHcPo0aMBAHK5HL/++ivmzp2L1tZWzJkzB21tbXj99dchlUqxdu1azJo1C//9739x3333IT4+vst9X3/9dQCAh4cHysrK8PDDDyM6OhrvvfceZDIZ9u/fj88//xxubm6YN2/eFcdBLpfDw8MD//73v3uMvbev8cKFC3j22WcRHx+P559/XvM9caV7Xe7xxx/HzJkz8fzzz+P777/Hc889h/Xr12P8+PFobm7G7Nmz4ejoiNdffx
22trZISkrCmjVrYGFhgbfeegvNzc2YO3cuvLy8sHr1apw6dQqvv/46Bg0ahISEhF7FoK1Zs2Zh9+7deP/99zFhwgTIZDK8/PLLiI6Oxj/+8Y9e3WPDhg1Yu3YtFi9ejHvuuUcvcRKR4WGiRUQmJTExERMmTIBUKsWnn37a7Y32mjVrIJPJsGnTJs2b/QkTJuD222/HBx98gO+//x7bt2+Hr68vlixZArFYjLFjx8LS0hKvvfYa9u3bh0mTJuHjjz/GyJEj8eqrrwIARo4cibFjx6K+vh5LlixBeHg4AKCurg6ffvopnnzySSiVSmzZsgXz5s3DzJkzAQAuLi544IEH4ODggGXLlkEqlSIhIQFZWVlYv359j4lWcXExPv30U4SFhSE1NfW6xislJQU//vgjQkJCUFdXBwCor6/H2rVr8cADD2he37hx4+Dg4IBXX30Vf/vb3zBs2LBr3tvGxgbR0dEAgAMHDgCA5u+68sMPP6CwsBA//PADQkJCAADDhw/H3XffjcTExG7//xsbG7Fs2bIrjp2Hh0eXGC9NCMeNGwcPDw9s375dk2j99ttvaGpqwt13343k5GTk5eXhm2++QUxMjCaWG2+8EWvXrsXq1avh4eHR5b6XPtfBgwcREhKCf//735rzY8aMwaFDh3Ds2LGrJlrNzc2ws7O7Yuy9fY1paWloa2vD888/j8DAwGve63KzZs3CU089BQCIj4/HPffcg48++gjjx49Hfn4+PDw88P7778PHxwcAMGrUKJw+fRrHjx8HABQWFiIiIgIvv/wyfHx8MG7cOGzevBkHDhzQW6IlEomwZMkS3HnnnVi6dCkkEglqamrw5ZdfQiKRXPPx//nPf7B06VK89dZbmD59ul5iJCLDxKWDRGRSQkND8d5778He3h6LFi3qNutz/PhxTJw4scsbRzMzM9x22204e/YsGhsb8c4772D79u0Qi8Vob29He3s7brrpJojFYiQmJqK9vR1paWkYN26c5h7m5uaIioqCpaWlJskC1G/O5XI5MjMzkZmZiZaWFs2sBqB+o21ubo7IyEhIpdIuj0tNTe2yVK/T+++/j9jYWEycOPG6xkqlUmHx4sWYPn06goODNceTkpIgl8sxadIkzetvb2/XLBM8dOhQl/tces3ly+p6G0dfH3vy5El4e3trkiwAsLS0xP/+978uszad1q5dC0dHR8yYMUPr5xKLxbjnnnuwe/duNDc3A1AnemPGjIGHhwfi4uKQnJyM6OhoKBQKtLe3w87ODmPHjkViYuI17z9u3Dh8/fXXMDc3R05ODvbs2YN169ahqqqqy/LTnhQXF8PW1lbr13S5sLAwmJmZ4euvv0ZhYSFaW1vR3t4OlUrVq8dfOpsjEolw44034syZM5DL5QgJCcHmzZvh5eWF/Px87Nu3Dxs2bEBeXp7m9QUEBGDdunXw8fFBa2sr9u/fj9raWgwdOrTL8yiVyi7fdz3F13lNb2L38fHBCy+8gB9++AHfffcdXn31VU0yeDV79+7Fm2++idjYWNx///3XvJ6IBhbOaBGRSbGxscE999wDf39/zJgxA8899xy2bNmi+WS6trYWLi4u3R7n4uIClUqFhoYGWFtbw9zcHID6jeel6urqUFlZCYVCAUdHxy7nHBwcYG9v3+VY59KriooKTdJ0+ePs7e3h4ODQ7XHt7e1d9q4A6kTx999/x44dO/Dzzz/3ZkiuaPv27cjPz8fHH3+M999/X3O8pqYGAK44g1JWVqb5c2FhYbcx6ksc27dvh0gkgrOzM2644QY8++yz3d5c96SmpgbOzs69ep78/Hx8+eWX+Oyzz1BUVNSnWO+99158/PHH2L17N0aNGoUjR45g2bJlmvMymQyAet/WpXt1ejMzolQqsWLFCnzzzTdoamqCp6cnIiMjNd+LV1NYWAgvL68+vKKufHx8sHTpUqxYsUKzzLNTXFzcNR/v5ubW5e/Ozs5QqVSoq6uDhYUFPv/8c3z88ceoqa
mBi4sLwsPDYWlp2W3/Y11dHUaMGAEAcHV1xS233NLl/KOPPtrtuS+Pb+3atVi7di0kEglcXFwwbtw4PPvss1csjHP
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Список для хранения коэффициентов силуэта\n",
"silhouette_scores = []\n",
"\n",
"# Вычисление коэффициентов силуэта для каждого количества кластеров\n",
"for i in clusters_range[1:]: \n",
" kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
" labels = kmeans.fit_predict(data_scaled)\n",
" score = silhouette_score(data_scaled, labels)\n",
" silhouette_scores.append(score)\n",
"\n",
"# Построение диаграммы значений силуэта\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(clusters_range[1:], silhouette_scores, marker='o')\n",
"plt.title('Коэффициенты силуэта для разных k')\n",
"plt.xlabel('Количество кластеров')\n",
"plt.ylabel('Коэффициент силуэта')\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 43,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средний коэффициент силуэта: 0.544\n"
]
},
{
"data": {
2024-12-14 07:16:17 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAJzCAYAAAA4M0NGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3iT5foH8G/2bJvuvfduaQtl740gOBAE5ChHPE5UXD/3UTkOcAGK4ygqIggIsvfelNECLQW6995Js39/9DQSk3TQpknb+3NdXtL3SfLeyZvx3u/zPPfD0Gq1WhBCCCGEEEIIMYlp6QAIIYQQQgghxNpR4kQIIYQQQggh7aDEiRBCCCGEEELaQYkTIYQQQgghhLSDEidCCCGEEEIIaQclToQQQgghhBDSDkqcCCGEEEIIIaQdlDgRQgghhBBCSDsocSKEEEIIIYSQdlDiRAi5a/Pnz0doaKjef4mJiViwYAHOnz9v6fAIIX1caGgoVq5cabD95s2bGDx4MEaOHInc3FyT91+5ciVCQ0MRHR2NxsZGo7f57bffEBoaijFjxnRX2ISQXooSJ0JIl0RERGDjxo3YuHEj1q9fjw8//BAcDgePPfYYbt26ZenwCCH9zK1bt7Bw4UIIBAKsW7cOfn5+7d5HpVLh8OHDRtt2797dzRESQnorSpwIIV0iFosRFxeHuLg4JCQkYNy4cVi5ciWYTCb++OMPS4dHCOlHsrKy8Mgjj0AkEmHdunXw9vbu0P0GDBiAPXv2GGwvKytDSkoKwsPDuztUQkgvRIkTIaTbCQQC8Hg8MBgM3bb58+dj/vz5erdbsWIFQkND9RKsdevWYezYsYiPj8e8efNw8+ZNAMCvv/6K0NBQ5OTk6D3Gn3/+ifDwcJSUlAAADh48iLlz5yI+Ph5RUVGYNGkSfv31V737vPrqqwZDDFv/Kyws1N3m70NzNmzYYDA0aPfu3ZgyZQri4uIwa9YspKSk6N2nvXjOnTuH0NBQnDt3Tu9+f3+9OvL6KRQKfPTRRxg5ciTCw8P1nldbSezfH/uDDz5AdHQ0jh8/DuCv4UzG/rsz7o689uXl5XjllVcwePBg3TG+fPkyAGDMmDHtHpeUlBTMmzcPsbGxGDhwIF555RVUV1frHv+PP/5AaGgoUlNTMXPmTMTExOCee+7B3r179eJoaGjAf/7zH4wbNw7R0dGYNm0aNm/erHebO+MJCwtDUlISnnnmGdTU1Jh8LQEgOzsbTz/9NAYOHIikpCQsXrwYWVlZJm/f1ut753HLzc3Fs88+i6FDhyIuLg7z58/HxYsXde2FhYW6+23fvl1vH0eOHNG13Wn37t2YNWsW4uPjMXToULz11luoq6sziO1Oxt6LY8aMwauvvmry779rjfXO53fp0iXMnj0b0dHRGDp0KN577z00NzebfIy/y8rKwoIFC2BjY4N169bBw8Ojw/edMmUKTp48aTBcb+/evfD390dYWJjBfQ4ePIhZs2bp4n3//fchlUoNbtORz/+ZM2fw6KOPIjY2FkOHDsUnn3wCtVqtu92pU6fw4IMPIj4+HklJSfjXv/7V5nuKEGIelDgRQrpEq9VCpVJBpVJBqVSioqICK1asgEKhwH333Wfyfvn5+Vi7dq3etv379+O9997D1KlTsXr1aqjVajzxxBNQKBS45557wOPx8Oeff+rdZ9u2bRg8eDDc3d1x9OhRPPXUU4iMjMRXX32FlStXwtvbG//+97+Rmpqqdz9nZ2fdEMONGzfiX//6V5vPs66uDp9//rnetrS0NCxduhRxcXH4+uuv4e7ujieeeAKVlZUA0Kl4OsvY6/fdd9/hp59+wiOPPIKffvoJGzduxKpVqzr1uGlpafjtt9/w+eefIz4+Xq/tztfrrbfe0mvryHNtamrCnDlzcO7cObz00ktYtWoVeDweHn30UeTm5mLVqlV6Mf/rX//S7c/FxQUXLlzAwoULwefz8fnnn+P//u//cP
78eSxYsMDgBHvx4sUYO3YsVq1aBX9/fyxZsgTHjh0DADQ3N2Pu3LnYsWMHFi1ahK+++goJCQl4/fXXsWbNGr3HGTlyJDZu3IhffvkFL774Ik6dOoUPPvjA5OtXVlaG2bNnIzc3F++88w4++eQTVFZW4pFHHkFtbW2br/2dr+/fj9vt27cxa9YsFBYW4o033sDy5cvBYDDwyCOPGMwnFIlEBsPOdu/eDSZT/yf/q6++wgsvvIC4uDh8+eWXeOqpp7Bv3z7Mnz+/UwlLdygpKcFjjz0Ge3t7rFq1Cs8++yz+/PNPvPzyyx26f3Z2Nh555BGIxWKsW7cOrq6undr/xIkToVarjb5uU6dONbj9jh078NRTTyEgIACrV6/G008/je3bt+PJJ5+EVqsF0LnP/9KlS5GQkIA1a9Zg2rRp+P7777Fp0yYAQEFBAZ588klERUXh66+/xgcffICcnBw8/vjj0Gg0nXqehJCuYVs6AEJI73bhwgVERkYabH/hhRcQGBho8n7Lli1DcHAwrl+/rttWXV2NuXPn4oUXXgDQ0oPSerU+PDwc48ePx/bt2/Hcc8+BwWCgtLQUZ8+exSeffAKg5eRy5syZeP3113WPGR8fj0GDBuHcuXOIjY3VbedyuYiLi9P9nZ2d3ebz/PLLL+Hh4aHX21BaWoqJEyfi/fffB5PJhJOTE6ZNm4YrV65g3LhxnYqns4y9fmlpaQgLC8Ojjz6q29baU9NRrT1+Y8eONWi78/WSy+V6bR15rlu3bkVRURG2bt2qG/o0YMAA3Hvvvbhw4QIeeOABvZh9fHz09rlixQr4+/vjm2++AYvFAgDExsZi6tSp2LJlCx5++GHdbefPn4+nnnoKADB8+HDMnDkTq1evxsiRI/HHH3/g5s2b2LBhgy45HD58OFQqFb766is89NBDkEgkAAAHBwddDElJSTh9+rTea/53a9euhUKhwI8//ghnZ2cAQFhYGObMmYPU1FSMHDnS5H3vfK5/P26rVq0Cl8vFzz//DLFYDAAYNWoUpk2bho8//livt2zEiBE4ceIEFAoFuFwu5HI5Dh06hKSkJF0PYV1dHb7++ms8+OCDeklwSEgIHn74YYPX09y+++472NvbY/Xq1bpjy2Qy8cYbbyAzM9Og1+tOubm5WLBgASorK6FUKu8qmXByckJSUhL27NmD6dOnAwCKioqQmpqKjz/+GF9//bXutlqtFsuXL8fw4cOxfPly3XY/Pz8sXLgQx44dw6hRozr1+X/ggQd079fBgwfj4MGDOHr0KB566CGkpaWhubkZixcv1iWEbm5uOHToEKRSqe79QAgxP0qcCCFdEhkZiXfffRdAywlFfX09jh8/js8++wxSqRTPP/+8wX2OHz+O06dP47vvvsOCBQt02x966CEAgEajgVQqxf79+8Hn8+Hp6QkAuP/++7Fz506kpKQgKSkJ27Ztg0gkwvjx4wEAixYtAtDSs5GTk4P8/HxcvXoVQEsSdrdu3ryp63VojREAJkyYgAkTJkCr1UIqlWLPnj1gMpnw9/c3azymXr/o6Gh8++232LdvH5KTkyESiTp8EqnVanH58mXs3r3boCerIzryXC9evAgvLy+9+SICgQD79u1r9/FlMhlSU1Px2GOP6Xo5AcDb2xuBgYE4deqU3on+zJkzdf9mMBgYP348Vq5ciebmZpw/fx6enp4GPWrTp0/H5s2b9RKc1n1pNBrcuHEDFy9exJAhQ0zGefHiRcTFxemSJqDlJPfIkSPtPse2nD9/HqNHj9Y7SWaz2bre2aamJt325ORkHD9+HOfOncPw4cNx/PhxiMViJCYm6hKnK1euQKFQYNq0aXr7SUxMhKenJ86fP9/lxKn1tWMymQa9Xa00Gg1UKhVSUlIwbNgwXdIEtCSAQMtr2lbitHPnTkRFReGzzz7Do48+ipdeeglr167V26dardb1BAEt74k79wW0DNd7//330djYCLFYjF27diEyMhK+vr56t8vOzk
ZpaSkWL16sex8CLYm1WCzGqVOnMGrUqE59/v/+XnRzc9MN+4uNjQWPx8P999+PSZMmYcSIERg0aBBiYmJMviaEEPO
2024-12-08 22:51:37 +04:00
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Добавляем индекс строки как дополнительный признак\n",
"data_scaled_with_index = np.hstack((data_scaled, np.arange(data_scaled.shape[0]).reshape(-1, 1)))\n",
"\n",
"# ========================\n",
"# Применение K-Means\n",
"# ========================\n",
"kmeans = KMeans(n_clusters=3, random_state=42) \n",
"df_clusters = kmeans.fit_predict(data_scaled)\n",
"\n",
"# ========================\n",
"# Оценка качества кластеризации\n",
"# ========================\n",
"silhouette_avg = silhouette_score(data_scaled, df_clusters)\n",
"print(f'Средний коэффициент силуэта: {silhouette_avg:.3f}')\n",
"\n",
"# ========================\n",
"# Визуализация кластеров\n",
"# ========================\n",
"pca = PCA(n_components=2)\n",
"df_pca = pca.fit_transform(data_scaled_with_index)\n",
"\n",
"plt.figure(figsize=(10, 7))\n",
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_clusters, palette='viridis', alpha=0.7)\n",
"plt.title('Визуализация кластеров с помощью K-Means')\n",
"plt.xlabel('Первая компонента PCA')\n",
"plt.ylabel('Вторая компонента PCA')\n",
"plt.legend(title='Кластер', loc='upper right')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-12-14 07:16:17 +04:00
"Средний коэффициент силуэта, равный 0.544, указывает на хорошую кластеризацию. \n",
2024-12-08 22:51:37 +04:00
"\n",
"Средний коэффициент силуэта (silhouette score) указывает на качество кластеризации, измеряя, насколько хорошо точки внутри одного кластера близки друг к другу по сравнению с точками из других кластеров. Значения коэффициента силуэта находятся в диапазоне от -1 до 1:\n",
"\n",
"1: Указывает на идеально плотные и четко разделенные кластеры. \n",
"0: Указывает на перекрытие кластеров или слабую структуру кластеризации. \n",
"Отрицательные значения: Указывают, что точки в кластере расположены ближе к другому кластеру, чем к своему."
]
},
{
"cell_type": "code",
2024-12-14 07:16:17 +04:00
"execution_count": 45,
2024-12-08 22:51:37 +04:00
"metadata": {},
"outputs": [
{
2024-12-14 07:16:17 +04:00
"name": "stdout",
"output_type": "stream",
"text": [
"Средний коэффициент силуэта (агломеративная кластеризация): 0.542\n"
2024-12-08 22:51:37 +04:00
]
2024-12-14 07:16:17 +04:00
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAJzCAYAAAA4M0NGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/GU6VOAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3wUdf748ddnZkt6Qu8dpTcFBUWa9awnV2ygqCh2UbF91VN/llMPywkodkXF7ime3HlnObsIioCCIL13Qkiy2TLz+f0xyZJNdrPpu0nez8eD8zKfLZ+d2Z2Z96e8P0prrRFCCCGEEEIIEZOR6AoIIYQQQgghRLKTwEkIIYQQQggh4pDASQghhBBCCCHikMBJCCGEEEIIIeKQwEkIIYQQQggh4pDASQghhBBCCCHikMBJCCGEEEIIIeKQwEkIIYQQQggh4pDASQghhBBCCCHicCW6AkLUhYkTJ/L9999HbMvMzKRv375cddVVHHHEEQmqmRBCiIbqr3/9KwUFBdxwww2sXLmSa665hm+//RbTNBNdNSFEPZDASTRaffv25c477wTAsiz27dvHa6+9xsUXX8y7777LIYcckuAaCiGEaEgmTZrEhAkTGD58OG63mzvuuEOCJiGaEKW11omuhBC1beLEiQC8/PLLEdsLCwsZMWIE5557LjfffHMiqiaEEKIBCwaDbNy4kRYtWpCTk5Po6ggh6pHMcRJNSmpqKl6vF6VUeNvEiRPDgVaJhx9+mF69evHuu++Gt73yyisce+yxDBkyhAkTJrBq1SoAXn31VXr16sW6desiXuP999+nT58+bNu2DYCPP/6Yc889lyFDhtC/f39OOukkXn311Yjn3HLLLfTq1Svqv82bN4cfM27cuIjnvf766/Tq1YsZM2aEt82fP5+TTz6ZwYMHM378eBYtWhTxnHj1WbBgAb169WLBggURzyu7vyqz/wKBAA8++CCjR4+mT58+EZ+r9D4uq+xr33fffQwYMIAvvvgCgBkzZsTcX6XrXZl9v3PnTm6++WZGjBgRPsaLFy8GYNy4cXGPy6JFi5gwYQKDBg3iiCOO4Oabb2bv3r3h13/33Xfp1asXS5Ys4cwzz2TgwIGcdtpp/Pvf/46ox4EDB/jrX//Kcccdx4ABAzj11FN5++23Ix5Tuj69e/dm2LBhXH311ezbty/mvgRYu3ZteKjqsGHDmDJlCmvWrIn5+Ir2b+njtn79eq655hqOPvpoBg8ezMSJE/nhhx/C5Zs3bw4/b968eRHv8dlnn4XLSps/fz7jx49nyJAhHH300fzlL39h//795epWWrTv4rhx47jlllti/l1WSV1Lf74ff/yRs846iwEDBnD00Udzzz33UFRUFPM1Sl7npptuYuTIkfTr148RI0Zw0003RRyjaN+rzZs3V/p7vXPnTm699VZGjx7NwIED+eMf/8gnn3wSUY+S5z355JMR21etWlXuOwy19z2u6POX/j6U/VdybqvMeaWkLiX/+vfvz4knnhjxHYv2PSnZL6XPl5XdlzNmzMDtdtOjRw+ys7M5++yzy+3Dit6roKCAiRMn0rdvX/x+f/izxtofJSzL4umnn+bUU09l4MCBDB48mLPPPpvvvvsu4r1++uknLrroIg477DCGDx/O9ddfz44dOyq1zwHeeustTjnlFPr378+YMWOYMWMGlmWFy2+55RYmTpzI22+/zdixYxkyZAgXXHABv/76a/gxJcel9D757bff6NevX8QxXbFiBeeddx5DhgzhuOOO4/XXX4/4LL/++itXXXUVw4cPp1+/fhxzzDHce++9Eb+9sscRyh/zaN+BL7/8kl69eoXPBdF+936/n2OPPTbq90c0XRI4iUZLa00oFCIUChEMBtm1axcPP/wwgUCAP/zhDzGft3HjRl588cWIbf/5z3+45557OOWUU5g1axaWZXHZZZcRCAQ47bTT8Hq9vP/++xHPee+99xgxYgTt2rXjf/
/7H1deeSX9+vXjiSeeYMaMGXTq1In/9//+H0uWLIl4XqtWrXjjjTfC/y6//PIKP+f+/ft57LHHIrYtXbqUadOmMXjwYJ588knatWvHZZddxu7duwGqVJ+qirb/nnnmGV566SUuuOACXnrpJd544w1mzpxZpdddunQpr732Go899hhDhgyJKCu9v/7yl79ElFXmsxYUFHDOOeewYMECbrzxRmbOnInX6+Wiiy5i/fr1zJw5M6LOl19+efj9WrduzcKFC5k0aRIpKSk89thj/N///R/ff/89559/frkb7ClTpnDssccyc+ZMunXrxtSpU/n8888BKCoq4txzz+WDDz5g8uTJPPHEExx++OHcdtttzJ49O+J1Ro8ezRtvvMHLL7/MDTfcwNdff819990Xc//t2LGDs846i/Xr13PXXXfxt7/9jd27d3PBBReQm5tb4b4vvX/LHrfVq1czfvx4Nm/ezO2338706dNRSnHBBReUm2eYnp7Op59+GrFt/vz5GEbkpeiJJ57g+uuvZ/DgwTz++ONceeWVfPTRR0ycODFuwFLbtm3bxsUXX0yzZs2YOXMm11xzDe+//z433XRTzOf4fD7OP/981qxZw5133slzzz3H+eefz4cffsijjz4a8diS41j6+1Siou/17t27+eMf/8iiRYu47rrrmDFjBh06dODKK68sF5xWdr/X1vc43udv3bp1ufNb2d9YWdHOKyVKnjtr1iy6d+/OzTffXK4hqyJV2Zelvf/+++HGlcqaO3cuu3fv5qWXXsLj8YS39+3bN+J4//GPf4x43vTp03niiSc466yzePbZZ7nnnnvIzc3l2muvxefzAbB8+XImTJiA3+/noYce4u677+bnn3/m4osvrtQ+f+qpp7jjjjsYMWIEs2fP5rzzzuOZZ57hjjvuiKjLihUrePTRR7nqqqv429/+xr59+5gwYQI7d+6M+bnvu+8+QqFQ+G+fz8cll1xCKBRixowZnH766dx5553hRrGdO3dy3nnn4fP5eOCBB3jmmWc45ZRTePnll5kzZ06V9nlZwWCQ+++/P+7jnn322QoDYtE0yRwn0WgtXLiQfv36ldt+/fXX06NHj5jPu//++znkkEP45Zdfwtv27t3Lueeey/XXXw84PSglrfV9+vTh+OOPZ968eVx77bUopdi+fTvfffcdf/vb3wDn5vLMM8/ktttuC7/mkCFDOPLII1mwYAGDBg0Kb/d4PAwePDj899q1ayv8nI8//jjt27ePaMnevn07J554Ivfeey+GYdCyZUtOPfVUfvrpJ4477rgq1aeqou2/pUuX0rt3by666KLwtqpekEp6/I499thyZaX3V0krbonKfNZ//OMfbNmyhX/84x/06dMHgMMOO4zf//73LFy4kD/96U8Rde7cuXPEez788MN069aNp556KjzfYdCgQZxyyim88847nHfeeeHHTpw4kSuvvBKAY445hjPPPJNZs2YxevRo3n33XVatWsXrr78eDg6POeYYQqEQTzzxBGeffXZ4aFDz5s3DdRg2bBjffPNNxD4v68UXXyQQCPDCCy/QqlUrAHr37s0555zDkiVLGD16dMznlv6sZY/bzJkz8Xg8zJkzh4yMDADGjBnDqaeeykMPPRTRWzZq1Ci+/PJLAoEAHo8Hv9/PJ598wrBhw8I9Kfv37+fJJ5/kz3/+c0SwcOihh3LeeeeV25917ZlnnqFZs2bMmjUrfGwNw+D2229n5cqVUVuj169fT9u2bXnwwQfp1KkTAMOHD2fJkiXlgsnSx7Gsir7XL7zwAnv37uWjjz6iQ4cOgBOETZo0iYceeohTTz01HBiNGjWKf//73+zcuTMcmP3rX/+K2O9Qe9/jeJ+/9Dmu5PzWp08fOnbsGHU/QPTzSonSz23Xrh2ffvopK1asoFu3bjFfr7r7skRBQQHTp0+nX79+Ff7uSrMsKzzPdtiwYRFlGRkZEcf7yy+/jCjfuXMn1113XUSPjdfr5eqrr2
blypUMHjyY2bNnk5OTw/PPP4/X6wWgdevW3HDDDaxZs6bCfX7gwIFwYHb77bcDMHLkSHJycrj99tu58MILw/OCDxw
"text/plain": [
"<Figure size 1000x700 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
2024-12-08 22:51:37 +04:00
}
],
"source": [
2024-12-14 07:16:17 +04:00
"from sklearn.cluster import AgglomerativeClustering\n",
"\n",
2024-12-08 22:51:37 +04:00
"# Добавляем индекс строки как дополнительный признак\n",
"data_scaled_with_index = np.hstack((data_scaled, np.arange(data_scaled.shape[0]).reshape(-1, 1)))\n",
"\n",
"# ========================\n",
"# Агломеративная кластеризация\n",
"# ========================\n",
"agg_cluster = AgglomerativeClustering(n_clusters=3) \n",
"labels_agg = agg_cluster.fit_predict(data_scaled)\n",
"\n",
"# ========================\n",
"# Оценка качества кластеризации\n",
"# ========================\n",
"silhouette_avg_agg = silhouette_score(data_scaled, labels_agg)\n",
"print(f'Средний коэффициент силуэта (агломеративная кластеризация): {silhouette_avg_agg:.3f}')\n",
"\n",
"# ========================\n",
"# Визуализация кластеров\n",
"# ========================\n",
"pca = PCA(n_components=2)\n",
"df_pca = pca.fit_transform(data_scaled_with_index)\n",
"\n",
"plt.figure(figsize=(10, 7))\n",
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=labels_agg, palette='viridis', alpha=0.7)\n",
"plt.title('Визуализация кластеров с помощью агломеративной кластеризации')\n",
"plt.xlabel('Первая компонента PCA')\n",
"plt.ylabel('Вторая компонента PCA')\n",
"plt.legend(title='Кластер', loc='upper right')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Значение коэффициента силуэта лежит в диапазоне от -1 до 1. Ближе к 1: Хорошо сформированные, плотные кластеры, четко отделенные друг от друга. \n",
"\n",
"Ближе к 0: Кластеры пересекаются или слабо разделены, не имеют четких границ. Точки расположены одинаково близко как к своему кластеру, так и к соседним. \n",
"Ближе к -1 (Отрицательные значения): Некоторые точки скорее относятся к другим кластерам, чем к текущему (ближе к центрам других кластеров). Очень плохая кластеризация. \n",
"Ближе к 1: Все точки внутри каждого кластера плотно сгруппированы и значительно удалены от точек других кластеров. Свидетельствует о четкой и хорошо разделенной структуре данных. Единица говорит об идеальной кластеризации.\n",
"\n",
2024-12-14 07:16:17 +04:00
"Средний коэффициент силуэта, равный 0.542, указывает на то, что кластеры имеют хорошее разделение и четкие границы. Точки внутри каждого кластера достаточно плотно сгруппированы и значительно удалены от точек других кластеров, что свидетельствует о четкой и хорошо разделенной структуре данных."
2024-12-08 22:51:37 +04:00
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-12-14 07:16:17 +04:00
"version": "3.12.8"
2024-12-08 22:51:37 +04:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}