1254 lines
3.2 MiB
Plaintext
1254 lines
3.2 MiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Лабораторная работа №5\n",
|
|||
|
"\n",
|
|||
|
"*Вариант задания:* "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['summary', 'city', 'state', 'date_time', 'shape', 'duration', 'stats',\n",
|
|||
|
" 'report_link', 'text', 'posted', 'city_latitude', 'city_longitude'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"from sklearn.decomposition import PCA\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.metrics import silhouette_score\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"nuforc_reports.csv\")\n",
|
|||
|
"df = df.iloc[:1000].dropna()\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 45,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>summary</th>\n",
|
|||
|
" <th>city</th>\n",
|
|||
|
" <th>state</th>\n",
|
|||
|
" <th>date_time</th>\n",
|
|||
|
" <th>shape</th>\n",
|
|||
|
" <th>duration</th>\n",
|
|||
|
" <th>stats</th>\n",
|
|||
|
" <th>report_link</th>\n",
|
|||
|
" <th>text</th>\n",
|
|||
|
" <th>posted</th>\n",
|
|||
|
" <th>city_latitude</th>\n",
|
|||
|
" <th>city_longitude</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>Viewed some red lights in the sky appearing to...</td>\n",
|
|||
|
" <td>Visalia</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>2021-12-15T21:45:00</td>\n",
|
|||
|
" <td>light</td>\n",
|
|||
|
" <td>2 minutes</td>\n",
|
|||
|
" <td>Occurred : 12/15/2021 21:45 (Entered as : 12/...</td>\n",
|
|||
|
" <td>http://www.nuforc.org/webreports/165/S165881.html</td>\n",
|
|||
|
" <td>Viewed some red lights in the sky appearing to...</td>\n",
|
|||
|
" <td>2021-12-19T00:00:00</td>\n",
|
|||
|
" <td>36.356650</td>\n",
|
|||
|
" <td>-119.347937</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>Look like 1 or 3 crafts from North traveling s...</td>\n",
|
|||
|
" <td>Cincinnati</td>\n",
|
|||
|
" <td>OH</td>\n",
|
|||
|
" <td>2021-12-16T09:45:00</td>\n",
|
|||
|
" <td>triangle</td>\n",
|
|||
|
" <td>14 seconds</td>\n",
|
|||
|
" <td>Occurred : 12/16/2021 09:45 (Entered as : 12/...</td>\n",
|
|||
|
" <td>http://www.nuforc.org/webreports/165/S165888.html</td>\n",
|
|||
|
" <td>Look like 1 or 3 crafts from North traveling s...</td>\n",
|
|||
|
" <td>2021-12-19T00:00:00</td>\n",
|
|||
|
" <td>39.174503</td>\n",
|
|||
|
" <td>-84.481363</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>One red light moving switly west to east, beco...</td>\n",
|
|||
|
" <td>Knoxville</td>\n",
|
|||
|
" <td>TN</td>\n",
|
|||
|
" <td>2021-12-10T19:30:00</td>\n",
|
|||
|
" <td>triangle</td>\n",
|
|||
|
" <td>20-30 seconds</td>\n",
|
|||
|
" <td>Occurred : 12/10/2021 19:30 (Entered as : 12/...</td>\n",
|
|||
|
" <td>http://www.nuforc.org/webreports/165/S165825.html</td>\n",
|
|||
|
" <td>One red light moving switly west to east, beco...</td>\n",
|
|||
|
" <td>2021-12-19T00:00:00</td>\n",
|
|||
|
" <td>35.961561</td>\n",
|
|||
|
" <td>-83.980115</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>I'm familiar with all the fakery and UFO sight...</td>\n",
|
|||
|
" <td>Fullerton</td>\n",
|
|||
|
" <td>CA</td>\n",
|
|||
|
" <td>2020-07-07T23:00:00</td>\n",
|
|||
|
" <td>unknown</td>\n",
|
|||
|
" <td>2 minutes</td>\n",
|
|||
|
" <td>Occurred : 7/7/2020 23:00 (Entered as : 07/07...</td>\n",
|
|||
|
" <td>http://www.nuforc.org/webreports/157/S157444.html</td>\n",
|
|||
|
" <td>I'm familiar with all the fakery and UFO sight...</td>\n",
|
|||
|
" <td>2020-07-09T00:00:00</td>\n",
|
|||
|
" <td>33.877422</td>\n",
|
|||
|
" <td>-117.924978</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>I was driving up lakes mead towards the lake a...</td>\n",
|
|||
|
" <td>Las Vegas</td>\n",
|
|||
|
" <td>NV</td>\n",
|
|||
|
" <td>2020-04-23T03:00:00</td>\n",
|
|||
|
" <td>oval</td>\n",
|
|||
|
" <td>10 minutes</td>\n",
|
|||
|
" <td>Occurred : 4/23/2020 03:00 (Entered as : 4/23...</td>\n",
|
|||
|
" <td>http://www.nuforc.org/webreports/155/S155608.html</td>\n",
|
|||
|
" <td>I was driving up lakes mead towards the lake a...</td>\n",
|
|||
|
" <td>2020-05-01T00:00:00</td>\n",
|
|||
|
" <td>36.141246</td>\n",
|
|||
|
" <td>-115.186592</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" summary city state \\\n",
|
|||
|
"0 Viewed some red lights in the sky appearing to... Visalia CA \n",
|
|||
|
"1 Look like 1 or 3 crafts from North traveling s... Cincinnati OH \n",
|
|||
|
"3 One red light moving switly west to east, beco... Knoxville TN \n",
|
|||
|
"5 I'm familiar with all the fakery and UFO sight... Fullerton CA \n",
|
|||
|
"6 I was driving up lakes mead towards the lake a... Las Vegas NV \n",
|
|||
|
"\n",
|
|||
|
" date_time shape duration \\\n",
|
|||
|
"0 2021-12-15T21:45:00 light 2 minutes \n",
|
|||
|
"1 2021-12-16T09:45:00 triangle 14 seconds \n",
|
|||
|
"3 2021-12-10T19:30:00 triangle 20-30 seconds \n",
|
|||
|
"5 2020-07-07T23:00:00 unknown 2 minutes \n",
|
|||
|
"6 2020-04-23T03:00:00 oval 10 minutes \n",
|
|||
|
"\n",
|
|||
|
" stats \\\n",
|
|||
|
"0 Occurred : 12/15/2021 21:45 (Entered as : 12/... \n",
|
|||
|
"1 Occurred : 12/16/2021 09:45 (Entered as : 12/... \n",
|
|||
|
"3 Occurred : 12/10/2021 19:30 (Entered as : 12/... \n",
|
|||
|
"5 Occurred : 7/7/2020 23:00 (Entered as : 07/07... \n",
|
|||
|
"6 Occurred : 4/23/2020 03:00 (Entered as : 4/23... \n",
|
|||
|
"\n",
|
|||
|
" report_link \\\n",
|
|||
|
"0 http://www.nuforc.org/webreports/165/S165881.html \n",
|
|||
|
"1 http://www.nuforc.org/webreports/165/S165888.html \n",
|
|||
|
"3 http://www.nuforc.org/webreports/165/S165825.html \n",
|
|||
|
"5 http://www.nuforc.org/webreports/157/S157444.html \n",
|
|||
|
"6 http://www.nuforc.org/webreports/155/S155608.html \n",
|
|||
|
"\n",
|
|||
|
" text posted \\\n",
|
|||
|
"0 Viewed some red lights in the sky appearing to... 2021-12-19T00:00:00 \n",
|
|||
|
"1 Look like 1 or 3 crafts from North traveling s... 2021-12-19T00:00:00 \n",
|
|||
|
"3 One red light moving switly west to east, beco... 2021-12-19T00:00:00 \n",
|
|||
|
"5 I'm familiar with all the fakery and UFO sight... 2020-07-09T00:00:00 \n",
|
|||
|
"6 I was driving up lakes mead towards the lake a... 2020-05-01T00:00:00 \n",
|
|||
|
"\n",
|
|||
|
" city_latitude city_longitude \n",
|
|||
|
"0 36.356650 -119.347937 \n",
|
|||
|
"1 39.174503 -84.481363 \n",
|
|||
|
"3 35.961561 -83.980115 \n",
|
|||
|
"5 33.877422 -117.924978 \n",
|
|||
|
"6 36.141246 -115.186592 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 45,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>city_latitude</th>\n",
|
|||
|
" <th>city_longitude</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>count</th>\n",
|
|||
|
" <td>712.000000</td>\n",
|
|||
|
" <td>712.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>mean</th>\n",
|
|||
|
" <td>39.350240</td>\n",
|
|||
|
" <td>-97.057660</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>std</th>\n",
|
|||
|
" <td>5.558375</td>\n",
|
|||
|
" <td>17.807918</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>min</th>\n",
|
|||
|
" <td>25.774143</td>\n",
|
|||
|
" <td>-149.336500</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25%</th>\n",
|
|||
|
" <td>34.950725</td>\n",
|
|||
|
" <td>-116.385628</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>50%</th>\n",
|
|||
|
" <td>39.597100</td>\n",
|
|||
|
" <td>-93.326900</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>75%</th>\n",
|
|||
|
" <td>43.262550</td>\n",
|
|||
|
" <td>-82.476700</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>max</th>\n",
|
|||
|
" <td>61.214900</td>\n",
|
|||
|
" <td>-61.260300</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" city_latitude city_longitude\n",
|
|||
|
"count 712.000000 712.000000\n",
|
|||
|
"mean 39.350240 -97.057660\n",
|
|||
|
"std 5.558375 17.807918\n",
|
|||
|
"min 25.774143 -149.336500\n",
|
|||
|
"25% 34.950725 -116.385628\n",
|
|||
|
"50% 39.597100 -93.326900\n",
|
|||
|
"75% 43.262550 -82.476700\n",
|
|||
|
"max 61.214900 -61.260300"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.describe()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 47,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"summary 0\n",
|
|||
|
"city 0\n",
|
|||
|
"state 0\n",
|
|||
|
"date_time 0\n",
|
|||
|
"shape 0\n",
|
|||
|
"duration 0\n",
|
|||
|
"stats 0\n",
|
|||
|
"report_link 0\n",
|
|||
|
"text 0\n",
|
|||
|
"posted 0\n",
|
|||
|
"city_latitude 0\n",
|
|||
|
"city_longitude 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"summary False\n",
|
|||
|
"city False\n",
|
|||
|
"state False\n",
|
|||
|
"date_time False\n",
|
|||
|
"shape False\n",
|
|||
|
"duration False\n",
|
|||
|
"stats False\n",
|
|||
|
"report_link False\n",
|
|||
|
"text False\n",
|
|||
|
"posted False\n",
|
|||
|
"city_latitude False\n",
|
|||
|
"city_longitude False\n",
|
|||
|
"dtype: bool\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Процент пропущенных значений признаков\n",
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
|
|||
|
"\n",
|
|||
|
"print(df.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"print(df.isnull().any())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 48,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"summary object\n",
|
|||
|
"city object\n",
|
|||
|
"state object\n",
|
|||
|
"date_time object\n",
|
|||
|
"shape object\n",
|
|||
|
"duration object\n",
|
|||
|
"stats object\n",
|
|||
|
"report_link object\n",
|
|||
|
"text object\n",
|
|||
|
"posted object\n",
|
|||
|
"city_latitude float64\n",
|
|||
|
"city_longitude float64\n",
|
|||
|
"dtype: object"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 48,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Проверка типов столбцов\n",
|
|||
|
"df.dtypes"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Атрибуты \n",
|
|||
|
"\n",
|
|||
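|
"city_latitude: географическая широта города, в котором зафиксировано наблюдение (числовой признак, используемый для кластеризации).\n",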
|
|||
|
"\n",
|
|||
|
"Цель:\n",
|
|||
|
"Выявление географических закономерностей в сообщениях о наблюдениях НЛО (набор данных NUFORC).\n",
|
|||
|
"Кластеризация наблюдений на основе их характеристик (штат, город, широта) для выявления групп с похожими профилями."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Очистка данных"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Цель: Упростить набор данных, удалив несущественные столбцы, чтобы сосредоточиться на ключевых атрибутах, которые будут использоваться для кластеризации и анализа.\n",
|
|||
|
"\n",
|
|||
|
"Столбцы summary, stats, report_link, duration и text несущественны для анализа: они не содержат ценной информации для решения задачи."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 49,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" city state date_time shape posted \\\n",
|
|||
|
"0 Visalia CA 2021-12-15T21:45:00 light 2021-12-19T00:00:00 \n",
|
|||
|
"1 Cincinnati OH 2021-12-16T09:45:00 triangle 2021-12-19T00:00:00 \n",
|
|||
|
"3 Knoxville TN 2021-12-10T19:30:00 triangle 2021-12-19T00:00:00 \n",
|
|||
|
"5 Fullerton CA 2020-07-07T23:00:00 unknown 2020-07-09T00:00:00 \n",
|
|||
|
"6 Las Vegas NV 2020-04-23T03:00:00 oval 2020-05-01T00:00:00 \n",
|
|||
|
"\n",
|
|||
|
" city_latitude city_longitude \n",
|
|||
|
"0 36.356650 -119.347937 \n",
|
|||
|
"1 39.174503 -84.481363 \n",
|
|||
|
"3 35.961561 -83.980115 \n",
|
|||
|
"5 33.877422 -117.924978 \n",
|
|||
|
"6 36.141246 -115.186592 \n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Удаление несущественных столбцов\n",
|
|||
|
"columns_to_drop = [ \"summary\", \"stats\", \"report_link\", \"duration\", \"text\"] # Столбцы, которые можно удалить\n",
|
|||
|
"#\"date_time\", \"posted\", \"city\", \"state\",\n",
|
|||
|
"df_cleaned = df.drop(columns=columns_to_drop)\n",
|
|||
|
"\n",
|
|||
|
"print(df_cleaned.head()) # Вывод очищенного DataFrame"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Визуализация парных взаимосвязей\n",
|
|||
|
"Визуализировать ключевые атрибуты наблюдений НЛО (штат, город, широта) для выявления закономерностей и связей между ними."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 50,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"C:\\Users\\tumvu\\AppData\\Local\\Temp\\ipykernel_31852\\2150760320.py:35: UserWarning: Tight layout not applied. tight_layout cannot make Axes height small enough to accommodate all Axes decorations.\n",
|
|||
|
" plt.tight_layout()\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABSUAADg2CAYAAACqBxdBAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXiU1f3+8fuZyWSyT8ISICCJSVjCDgqi1h2lLrh2c4HWXbRWtK373hZrra11aS1W0VKtS+sPivbbRcVaNxBFhBKWBALEBMISJvtkm98fx0kICcpMJvNMJu/XdXGFnJN8OIFh5pn7OYvl9/v9AgAAAAAAAIAIcdg9AAAAAAAAAAB9C6EkAAAAAAAAgIgilAQAAAAAAAAQUYSSAAAAAAAAACKKUBIAAAAAAABARBFKAgAAAAAAAIgoQkkAAAAAAAAAEUUoCQAAAAAAACCiCCUBAACigN/vt3sIMSea/k6/aizRNFYAAIBIIJQEAAAIwZo1a/TjH/9YJ554oiZMmKAZM2borrvu0vbt24OutWnTJl144YU9MMq+obS0VKNGjdKrr74qSaqqqtLNN9+slStXhvXPWb58uUaNGqXly5cf8vc0NjZq/vz5Wrp0aVvbrbfeqpNPPrnt8zfffFO33HJLWMb42GOPadSoUWGpBQAA0JMIJQEAAIL0/PPP6zvf+Y727NmjH/7wh3rqqad01VVXacWKFfrGN76h9evXB1XvH//4h1atWtVDo419mZmZeumll3TiiSdKkgoLC7VkyRK1trbaOzBJFRUVeu6559Tc3NzWdu211+rxxx9v+/zZZ59VeXm5HcMDAACwTZzdAwAAAOhNPv74Y/3sZz/TxRdfrDvuuKOt/aijjtKMGTN07rnn6vbbb2+btYeeFx8fr0mTJtk9jEM2fPhwu4cAAABgO2ZKAgAABOHpp59Wamqqbrrppk59/fr106233qpTTjlFdXV1kqSGhgY9/PDDOu200zRu3DhNmTJFl156qQoLCyWZ5baBWXOjRo3SY489JklqbW3VggULdOqpp2rcuHGaOXOmFi1a1OV4TjnlFE2YMEHf+c539NZbb3VaYrxmzRpdfvnlOuqoozRlyhRdc8012rRpU1t/YFnyiy++qJNOOklTpkzRG2+8oVGjRundd9/t8OetXLlSo0aN0scff9xpLEuXLtWoUaO0cePGDu2BWuvWrZMkPffcc/r617+u8ePH67jjjtO9996rmpqaL/1737x5s77//e9r2rRpmjp1qq6++moVFxdL6rh8e/ny5ZozZ44kac6cOZo9e7aef/55jRo1Slu2bOlQc8mSJSooKOjWLMU33nhDF110kSZPnqxx48bp61//up5//vm2cZ1yyimSpNtuu61tyfb+y7dnz56tFStWaMWKFW3/bq+++qpGjRql0tLSDn/WySefrFtvvbXtc5/PpwceeEDHHnusJk+erNtuu00+n6/TGFeuXKlLLrlEEydO1LRp03TLLbdo7969If/MAAAA4UAoCQAAcIj8fr/effddHX300UpMTOzya8444wxdd911SkpKkiTdfPPN+utf/6qrrrpKzzzzjG677TZt2rRJP/zhD+X3+/XNb35T3/jGNyRJL730kr75zW9Kku699149+uijOvvss/Xkk0/q61//uubPn68nnnii7c96/PHH9ctf/lKnn366fvvb32rixImaN29eh/F8+OGHbftVzp8/Xz/96U9VXl6u73znO22h3v71brnlFt1999065phjlJmZqSVLlnT4msWLFysnJ0dHHHFEp599xowZSkpK0uuvv96h/bXXXtOIESM0ZswYvfbaa3rooYd08cUX6+mnn9Z1112nJUuW6Cc/+clB/9537typb3/72yopKdG9996rhx56SLt379Z3v/td7du3r8PXjh07Vnfffbck6e6779Y999yjWbNmye12d/mzHH300RoyZMhB/+wv8/bbb+u6667T2LFj9d
vf/laPPfaYDjvsMN1///1avXq1MjMz2wLnuXPndliyHXDPPfdozJgxGjNmjF566SWNHTv2kP/8H//4x3r55Zd19dVX65FHHpHX69Wzzz7b4Ws++ugjfe9731NCQoIeeeQR3X777VqxYoXmzJmjhoaGkH5uAACAcGD5NgAAwCGqrKyUz+fTsGHDDunrGxsbVVtbqzvvvFNnnHGGJGnatGmqqanRz3/+c+3evVuDBw/W4MGDJaltCfKWLVv08ssv66abbtJVV10lSfra174my7L0+9//XhdddJHcbreeeuopXXzxxfrRj37U9jX19fV66aWX2sbw8MMPKzs7WwsWLJDT6Wz7ulNPPVWPPvqofvOb37R97UUXXaSvf/3rbZ+fd955WrRokWpra5WcnKyGhgb93//9X9uYDpSYmKiZM2fq73//u2688UZJUm1trZYtW6brrrtOkrRixQoNGzZMF198sRwOh6ZNm6akpCR5vd6D/j0+++yzamxs1MKFCzVw4EBJ0ujRo3XhhRdq9erVysvLa/valJQU5efnS5Ly8/Pbfn/qqafqb3/7m2644QZZlqUdO3boww8/1EMPPXTwf8CvUFRUpPPOO6/DMv7JkyfrqKOO0vLlyzVx4kQVFBRIMku2x4wZ06lGfn6+UlJSJCmoJeibNm3SP//5T917771tofNxxx2nWbNmqaioqO3rHn74YR1++OH6/e9/3/bvP3HiRJ155pn661//qosvvjjonxsAACAcmCkJAABwiAKhTktLyyF9fXx8vJ5++mmdccYZ2rlzpz788EO9+OKLWrZsmSQTWnblww8/lN/v18knn6zm5ua2XyeffLJ8Pp8+/vhjffrpp2poaOgQIkrSWWed1fb7uro6rVmzRqeffnrb2CUpLS1NJ510klasWNHhewMBWsAFF1yguro6/fvf/5Yk/fvf/1ZdXZ3OPffcg/7M55xzjrZt26bPPvtMkjlZurGxUWeffbYkafr06dqyZYvOP/98Pf7441qzZo1mzZql2bNnH7Tmxx9/rEmTJrUFkpI0ePBgLVu2TCeccMJBv29/3/jGN/T555+3nci9ePFiJScn69RTTz2k7+/KFVdcoZ///Oeqra3V2rVr9fe//12///3vJR383zZcAj/H/qd4OxwOzZw5s+3z+vp6rV69WieccIL8fn/b4+iwww5TXl6e3nvvvR4dIwAAwJdhpiQAAMAh8ng8Sk5OVllZ2UG/pq6uTk1NTfJ4PJKk//73v5o/f742b96s5ORkjR49um1pt9/v77JGYEnymWee2WX/zp072+r369evQ1///v3bfl9dXS2/368BAwZ0qjFgwABVV1d3aAuMKyA7O1vTpk3T4sWLde6552rx4sU65phjNGjQoIP9+DrqqKM0aNAgvf7665owYYJef/11TZs2rW026BlnnKHW1la98MILbUuehw4dqh/96Edts0m7+vs41NmpBzN9+nQNGzZMixcv1tSpU7V48WKdccYZcrvdIdfcu3ev7rnnHr3xxhuyLEvZ2dk68sgjJR383zZcAjNLMzIyOrTvH9xWVVWptbVVTz31lJ566qlONbrzswMAAHQXoSQAAEAQvva1r2n58uXy+Xxdhjovv/yyHnzwQf3lL39RamqqrrvuOs2YMUO///3vddhhh8myLD3//PP673//e9A/Iy0tTZI5ECY5OblTf1ZWVtuhLXv27FFubm5b3/4HmKSmpsqyLO3evbtTjV27dik9Pf0rf94LLrhAt99+u4qLi/XBBx/ol7/85Zd+vcPh0KxZs/Taa6/pmmuu0Xvvvaf777+/w9ecddZZOuuss1RdXa13331XTz31lH784x/riCOO6DLwTE1N7fJglg8++EDDhg2TZVlf+XNYltW2HP3CCy/Uli1b9OCDD37l932ZH/3oR9q8ebOeffZZTZ48WfHx8aqvr9fLL7/crbqBn6e1tbVDe21tbdvvA2Hk7t27lZWV1da+/x6bycnJsixL3/
ve97oMuA+2LyoAAEAksHwbAAAgCJdddpn27dunRx55pFPfrl279Mwzzyg/P19jx47V2rVr5fP5dNVVV2n48OFtYVM
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x4500 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Настройка стиля графиков\n",
|
|||
|
"sns.set(style=\"whitegrid\")\n",
|
|||
|
"\n",
|
|||
|
"# Создание фигуры\n",
|
|||
|
"plt.figure(figsize=(16, 45))\n",
|
|||
|
"\n",
|
|||
|
"# График 1: Штат vs Широта\n",
|
|||
|
"plt.subplot(4, 1, 1)\n",
|
|||
|
"sns.scatterplot(x=df_cleaned['state'], y=df_cleaned['city_latitude'], alpha=0.6, color='blue')\n",
|
|||
|
"plt.title('Category vs city_latitude')\n",
|
|||
|
"plt.xlabel('state')\n",
|
|||
|
"plt.ylabel('city_latitude')\n",
|
|||
|
"plt.xticks(rotation=90)\n",
|
|||
|
"\n",
|
|||
|
"# График 2: Город vs Широта\n",
|
|||
|
"plt.subplot(4, 1, 2)\n",
|
|||
|
"sns.boxplot(x=df_cleaned['city'], y=df_cleaned['city_latitude'], color='green')\n",
|
|||
|
"plt.title('Sub-Category vs city_latitude')\n",
|
|||
|
"plt.xlabel('Sub-Category')\n",
|
|||
|
"plt.ylabel('city_latitude')\n",
|
|||
|
"plt.xticks(rotation=90)\n",
|
|||
|
"\n",
|
|||
|
"# График 3: Штат vs Город\n",
|
|||
|
"plt.subplot(4, 1, 3)\n",
|
|||
|
"sns.countplot(x=df_cleaned['state'], hue=df_cleaned['city'], palette='Set3')\n",
|
|||
|
"plt.title('Category vs Sub-Category')\n",
|
|||
|
"plt.xlabel('state')\n",
|
|||
|
"plt.ylabel('Count')\n",
|
|||
|
"plt.xticks(rotation=90)\n",
|
|||
|
"\n",
|
|||
|
"# Упорядочиваем графики\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Стандартизация данных для кластеризации"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 51,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" city_latitude state_AB state_AK state_AL state_AR state_AZ state_BC \\\n",
|
|||
|
"0 -0.538951 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 -0.031639 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 -0.610081 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 -0.985300 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 -0.577732 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" state_CA state_CO state_CT ... city_Winchester city_Winnsboro \\\n",
|
|||
|
"0 1.0 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"3 1.0 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 ... 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" city_Winston city_Woodburn city_Woodland city_Woodland Park \\\n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" city_Woonsocket city_Yarmouth city_Yelm city_Yuma \n",
|
|||
|
"0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"1 0.0 0.0 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 636 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Выделяем числовые и категориальные признаки\n",
|
|||
|
"numerical_cols = ['city_latitude']\n",
|
|||
|
"categorical_cols = ['state', 'city']\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"df_numerical_scaled = scaler.fit_transform(df_cleaned[numerical_cols])\n",
|
|||
|
"\n",
|
|||
|
"# Кодирование категориальных признаков с помощью OneHotEncoder\n",
|
|||
|
"encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # sparse_output=False: плотный массив вместо разреженной матрицы, для удобства\n",
|
|||
|
"encoded_data = encoder.fit_transform(df_cleaned[categorical_cols])\n",
|
|||
|
"\n",
|
|||
|
"# Создаем новые столбцы для закодированных категориальных признаков\n",
|
|||
|
"encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))\n",
|
|||
|
"\n",
|
|||
|
"# Объединяем числовые и закодированные категориальные данные\n",
|
|||
|
"df_encoded = pd.concat([pd.DataFrame(df_numerical_scaled, columns=numerical_cols), encoded_df], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"# Выводим результат\n",
|
|||
|
"print(df_encoded.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 52,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABR8AAAP0CAYAAAAjkkunAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXycZb3///c9W2Yy2dp0IW1a6AJll6XsqyCIiAjigiJ+9cAPFTwqHBU9KsJREBU4CgKK4oZ4QBTZ3A6boKhA4agIWqAttGnTJk2aTGZf7vv3R0xImkkyk8w9933PvJ7nwcPTzGRyzcy9XNf7/lzXbViWZQkAAAAAAAAAKszndAMAAAAAAAAA1CbCRwAAAAAAAAC2IHwEAAAAAAAAYAvCRwAAAAAAAAC2IHwEAAAAAAAAYAvCRwAAAAAAAAC2IHwEAAAAAAAAYAvCRwAAAAAAAAC2IHwEAACocZZlOd0ER7jpfU/XFje11Ul8DgAA1B7CRwAAMGPnnnuuVq1aNe6/fffdV8cff7yuuOIKDQ4OTvidDRs26PLLL9cb3vAG7b///jr++ON1ySWX6J///Oekf+e///u/tWrVKn3xi1+ctk0f+MAHdOihhyqbzU76nLe85S0655xzJEmrVq3SDTfcUMK7raxPf/rTOuGEE0b/fcIJJ+jTn/50Rf/G1q1bdcEFF2jz5s22/p3ZiMfj+tCHPqTXve51OuSQQ/TKK6+U/RpPPvmkVq1apSeffFJS8fddCXfffbdWrVqlrq6ukn8nFovpU5/6lNasWTP6s3PPPVfnnnvu6L/vuusufeUrX6lIG3fermbzOjvv22P/+81vflOB1r4mm83qqquu0v3331/R1wUAAM4LON0AAADgbXvvvbe+8IUvjP47l8vp+eef13XXXad//OMf+p//+R8ZhiFJ+t///V996lOf0u67764Pf/jD6uzs1NatW/XDH/5Q73znO3XzzTfrqKOOGvf6pmnqnnvu0R577KF7771Xn/jEJxSJRCZtz1lnnaU//vGPevzxx/WGN7xhwuPPP/+8XnzxxdGw584779Quu+xSiY9iVr75zW+qqampoq/5xz/+UY899pjtf2c27rnnHj366KO67LLLtPvuu6uzs7Ps19hnn3105513auXKlZKKv2+n/OMf/9C9996rs846a/RnY/cXSbr55pt16KGHVrtp05o/f76++c1vFn1st912q+jf6unp0Q9/+EN9+ctfrujrAgAA5xE+AgCAWWlqatIBBxww7meHHHKIEomErr/+ev31r3/VAQccoI0bN+rSSy/VMccco69//evy+/2jzz/55JP17ne/W5deeqkeeeQRhUKh0cf+8Ic/aOvWrbruuuv03ve+Vw888IDe8Y53TNqek046Sa2trbrvvvuKho+/+MUv1NTUpDe+8Y2SNKHtTtl7771r6u+UamBgQJL0nve8ZzSkLlexbdDNRkJStwuFQp76XAEAgDsx7RoAANhi3333lSRt2bJFknTbbbcpm83qc5/73LjgUZIikYguvfRSnXXWWROmav/85z/XHnvsoYMPPliHHXaY7rzzzin/bkNDg0477TT97ne/UzweH/dYLpfTL3/5S735zW8erZ7cedr1D3/4Q51yyinab7/9dMwxx+jyyy8ffZ2uri6tWrVKd99997jX3Xmqa6FQ0C233KLTTjtN+++/vw444ACdffbZ+vOf/zxpu8dOh77hhhsmne460tbp/sbdd9+tz3zmM5KkE088cfS1d552PTQ0pC9/+ct6wxveoP3220+nnXaafvazn01o2/XXX6+vfOUrOvLII7X//vvrvPPOm3aKdCaT0Y033jj6eZ588sm65ZZbZJqmpOHpxyPvZ88995xyOvhf/vIX/du//ZsOOuggHX744brkkku0bds2SeOnXRd731/5yle0//77a2hoaNxr3nTTTTr44IOVSqWmfB9Tueuuu/S2t71NBxxwgPbff3
+99a1v1a9//evRdr3vfe+TJL3vfe8bnWo9dtr1CSecoM2bN+sXv/jF6JTuke9/Zztvq4ODg/rMZz6jQw89VIcccoi+9rWvjX62Yz300EN629vepv32209HHXWUvvSlLymZTM74Pc/k9R966CG95z3v0YEHHqh9991Xp5xyim6//XZJw/vViSeeKEn6zGc+M7ov7Tw9XZo4xf7uu+/W3nvvrbvuuktHHXWUDj30UL388stVed8AAKA0hI8AAMAWGzZskCQtWbJEkvT73/9ee++9txYuXFj0+UcccYQuvvhizZ8/f/RnAwMDeuSRR3TGGWdIks4880w999xzev7556f822eddZYymYx++9vfjvv5448/rv7+/kkrJx944AF97Wtf0znnnKNbb71VF110ke69996S1poc65prrtFNN92kd73rXfrud7+rL37xixoYGNDHPvaxkoKud7zjHbrzzjvH/XfwwQcrGo3q1FNPLelvHH/88frwhz8saXiq9YUXXjjh76TTab3nPe/R/fffr/PPP380jPvsZz+rb33rW+Oe+6Mf/Ujr16/Xl7/8ZX3pS1/S3//+d1166aWTvgfLsvShD31I3/3ud/WOd7xD3/rWt3TKKafo61//+ui04y984Qt6+9vfLml4+nuxNkrSCy+8oPe+973KZDL66le/qiuuuEJ///vfdd555ymfz497brH3/fa3v12ZTGbCOoX33nuvTj311Cmn8U/l9ttv12WXXaY3vOEN+va3v61rrrlGoVBIn/jEJ7R161bts88+uuyyyyRJl1122YTp1iNtnD9/vo477jjdeeedWrBgQUl/2zRNnX/++Xrsscd06aWX6uqrr9azzz6rX/3qV+Oed//99+uiiy7S8uXLdeONN+ojH/mI7rvvPl144YUl3dwln89P+G/s75Xy+r/73e900UUXaZ999tFNN92kG264QUuWLNF//dd/6a9//asWLFgwOr37wx/+8KRTvSdTKBT0ve99T1deeaU+85nPaMWKFbN+3wAAoHKYdg0AAGbFsqxxAdDg4KCeeuop3XzzzaNVTtLwTUD22muvsl77/vvvl2maeutb3yppeHr2f/3Xf+mOO+6YMhDcZ599tNdee+n+++8ft9bePffco1WrVmm//fYr+ntPPfWUOjs7dc4558jn8+nQQw9VY2Nj0RvnTKWnp0cXX3zxuKqthoYG/fu//7vWrl077VTWXXbZZdw6lD/4wQ/07LPP6pvf/KZWrFhR8t9YunSpJGmvvfYqupbi3XffrRdffFF33HGHDjzwQEnSMccco3w+r5tuuklnn3222traJEktLS266aabRqtWN27cqBtuuEE7duzQnDlzJrz2448/rj/+8Y+67rrr9OY3v1mSdNRRRykcDusb3/iG3ve+92n33XcffZ9TfSbf+ta31NbWpu9973tqaGiQJC1YsED/8R//oZdeemncc+fOnVv0fR944IG69957R4PnZ599Vq+88oquvvrqSf/udDZt2qTzzjtvXGi6ePFive1tb9MzzzyjN7/5zaNTrFeuXFl0uvXee++tUCikuXPnljXF+fHHH9ff/vY3fec739Gxxx4raTjAH1uBa1mWrrnmGh1zzDG65pprRn++22676f3vf78ee+wxHX/88ZP+jc2bN2ufffaZ8PP/+I//0AUXXFDy67/88ss688wz9dnPfnb0OQceeKAOO+wwPfnkk3rd6143emxYunTpjJYG+NCHPjT6Xmb7vgEAQGURPgIAgFl5+umnJwQUPp9PRx55pP7rv/5rdB0/v9+vQqFQ1mv//Oc/12GHHaZQKKRYLCZpeJrqAw88oEsvvXTKG6ecddZZuuqqq7Rt2zYtXLhQAwMDevTRR/WpT31q0t85/PDDdeedd+ptb3ub3vCGN+i4447TW97ylrLXIrz22mslSf39/Vq/fr1effVVPfroo5I05V24i/n973+vr371q7rwwgvHrWFZib/x1FNPafHixaPB44
jTTz9dP/vZz/TXv/5Vxx13nCRpv/32GzddfiQ0TKVSRcPHp556SoFAQKeccsqE1/7GN76hp556SrvvvntJ7XzmmWd
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x1200 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.decomposition import PCA\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Применение PCA ТОЛЬКО к числовым данным\n",
|
|||
|
"pca = PCA(n_components=1)\n",
|
|||
|
"kc_pca = pca.fit_transform(df_numerical_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация\n",
|
|||
|
"plt.figure(figsize=(16, 12))\n",
|
|||
|
"plt.scatter(range(len(kc_pca)), kc_pca, alpha=0.6)\n",
|
|||
|
"plt.title(\"PCA Visualization of city_latitude Feature\")\n",
|
|||
|
"plt.xlabel(\"Sample Index\")\n",
|
|||
|
"plt.ylabel(\"Principal Component 1\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Агломеративная (иерархическая) кластеризация"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 53,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABSAAAAPyCAYAAABsM3EPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeZxd8/0/8NdEMtYgYok1JLGT0BAUQahq0Ta0qnaqtqC1lKD2tZogxN4WsVQVRVutFhW+RVI7rVIJQYgtklgzWe7vj/zmdiYzicmYk7mTeT4fjzzOzPmce877fc+ZmcxrzlJVKpVKAQAAAAAoQIfWLgAAAAAAWHAJIAEAAACAwgggAQAAAIDCCCABAAAAgMIIIAEAAACAwgggAQAAAIDCCCABAAAAgMIIIAEAAACAwgggAQAAaFSpVGrtEsqaUksl1QvA/wggARYAgwcPztprr93ov8GDB7d2eUAdkydPTt++ffPcc89l8uTJOfzww/PrX/+6tcuiAjz33HP5+te/npqamtYupSI88MAD2W233fLWW2/lzTffzIABA/LCCy+0dlkV780338zaa6+dyy67LHfeeWfWXnvtjBo1qkmvHTx4cAYMGFD+/IEHHsiJJ57Y4jXuu+++2XfffefpNU8++WQOOeSQ8ue1fd55551JkilTpuSEE07IE0880SI11r6HTTVq1KhyPZdddlnWXnvtvPnmm6mpqclOO+2UZ555pkXqAmirOrZ2AQC0jOWWWy7Dhw+vN+/II49spWqAOVlqqaVy4IEHZo899kipVMraa6+dn//8561dFq1s6tSpOfHEE/PTn/401dXVrV1ORdhmm21yyy23ZLvttkuSDBw4MBtssEErV1X5qqqqytO6HzfFEUcckf3226/8+fXXX9/i9TXX7373u4wZM6b8+fLLL5/f/va3WW211ZIkL774Yu6+++7svvvurVLfnN736urqHH/88TnxxBNz9913Z5FFFmmV+gBamwASYAEwY8aMLLbYYtloo43qzfdLLFSmI488MnvuuWemTJmS7t27Z6GFFmrtkmhlt9xySzp27JgddtihtUupGB07dsyvfvWrvPHGG1looYWy0kortXZJbcJyyy2Xjh07ZoUVVki3bt2SJCuuuGKTXlsb5rUF1dXVDf7f05pq3+Nu3bplxowZ6dixY5ZbbrkkyQ477JBLLrkkv/nNb3LggQe2ZpkArcYl2AALgOnTpzf5L+pPPPFE9tlnn/Tp0yf9+vXLiSeemIkTJ5bHay/XevPNN+u9bsCAAfUu5542bdocL/uefV3PPvtsBg4cmN69e2fXXXfNX/7yl3rr/uijj3L++ednhx12yIYbbphddtklt99+e4Ptz76dN998M/vuu28GDx6cq666Kl/96lfTt2/fHHHEERk/fny9199///3Za6+9svHGG2eDDTbITjvtlJtvvrk8Xnvp1Nprr50nn3yy3mtvuummrL322vUuS6ut52c/+1m9ZSdPnpwNNtigwSVvX7T9Ofnd736X3XbbLRtttFF69+6db3/72/nzn//c4D1u7LL7Oe2f2S97u/fee7Pbbrtl4403zpZbbpnTTjstkydPLo/XXkq28cYbN7g09Oijj25wqf/UqVNz4YUXZptttskGG2yQXXfdNffee2+91w0YMCAXX3xxzjvvvGy66abZbLPNcsIJJ2TSpElN7n9utx648847y/u07n744IMPsskmmzS6L9dee+2ss8462XTTTXPUUUflww8/LC/T2KV4te9Lc97LJFl22WXTo0ePPProo194u4TZt/WnP/0pm266aYYOHZqk/vE7+7+6df/nP//JkUcemc033zzrr79+tt5665xzzjn5/PPPy8vU1NTkkksuyfbbb5/evXtnl112ye9///smvedJ8tZbb+XYY49Nv3790qdPn+y///7597//XV5/7WWTf/rTn3LYYYelT58+2XbbbXP55Zdn5syZ9fbL7O
/JscceW2+flkqlDBs2LFtvvXX69u2bww47LG+//XZ5+RkzZuSaa67JLrvskt69e2ejjTbKnnvumccff3yu+zFpuM9n/7xUKmXPPfes9/1y9stXk+TWW2/9wks5a2pqct1112WXXXYpz2vsEtXZj+nGan/kkUcaHE8ff/xxzj777Gy99dbZaKONsvvuu+ehhx5qsN4vOn6mTp2ayy+/PDvttFM23HDD7Ljjjrnmmmvq7bd999233uu/8pWv5KCDDsobb7wxz+up7X/VVVfNSiutlKFDh9Y71hoz+/Zn/5mRzNpP++67b26//fZst9122XjjjbP//vvnP//5T3k9jf0s/O9//5v111+/3n558cUXs/fee2fjjTfODjvskFtvvXWO+6ux3pJk4sSJOfPMM7Pddttlgw02SL9+/TJo0KB62579Neeee2423HDDPPzwww3eg+rq6vTo0SO9evXKOuusk8UWWyyrrLJKklnH7fXXX59vfOMb6d27d772ta/lV7/6Vfm+iXWP4X333TejR4/O6NGjs/baa+fRRx/NVlttleOOO67BNnfccccGPw/nxRe9B4MHD87vf//7jB8/vnwM1L0Ee9SoUeUzN/fbb7/ye9XY95HG9u3o0aPz/e9/P3369MnXv/71PProow1q/KKfbausskoWX3zx9OzZM+uss066d+9e7w/Bu+66a6677jq3WADaLWdAAiwAPvvssyy11FJfuNw///nPHHjggdl8881zySWXZPLkyRk2bFj222+/3H777fN0WdDUqVOTJFdeeWWWWWaZJLPCotmDwyQ59NBDs88+++SYY47J7bffnp/85Ce5+uqrs8022+Tzzz/PXnvtlQ8++CBHH310Vl555dx///055ZRT8v777+ewww4rr2ebbbbJEUccUf58+eWXTzLrHlVdunTJz372s8ycOTNDhw7Nvvvumz/96U9ZdNFF89BDD2XQoEHZb7/9ctRRR+Xzzz/PLbfckrPOOisbbLBB+vTpU17n4osvngcffDB9+/Ytz7v33nvToUPDv9ktvvjieeihh1IqlcqXW/31r3/NjBkz6i03L9uv6+abb84555yTo446Kn379s3kyZNz7bXX5vjjj8/GG29cPrMlSYYPH14+06J2fyTJd7/73Xzve98rf37mmWfW28YVV1yRSy+9NHvttVeOOeaYvPHGGxk2bFieeeaZ3HbbbfWOiaqqqjz22GPZZpttkiSffPJJRo4cWe+9KZVKGTRoUJ566qkcffTR6dmzZ/72t7/lmGOOSU1NTb7zne+Ul73lllvSvXv3nH/++Zk4cWKGDh2acePG5dZbb01VVdUX9n/EEUdkzz33TDLrjML11luvfHysttpq+e9//9vgPR06dGg++uijLLnkkvXm1x5b06ZNy5gxY3LhhRfm3HPPzZAhQxrdN42Zl/ey1rRp03Leeec1eRtJ8vnnn+ess87KwQcfnF133bXe2GmnnZb111+//Pn3v//98sfvvvtu9t5772y00Ua54IILUl1dnYcffjjXXXddll9++fK91Y4//viMHDkyhx9+ePr06ZORI0dm8ODB6dSp0xe+5xMnTsyee+6ZRRddNKeeemoWXXTR3HDDDdl7771z++23p2fPnuV6zjjjjGyzzTa57LLL8uSTT2b48OH59NNP89Of/rTRvp944on86U9/qjfv+uuvz9VXX50TTjgha6yxRi644IL8+Mc/zm233ZYkGTJkSH7zm9/kuOOOy9prr5133nknl19+eX784x/noYceyqKLLjpP731dd999d55++um5LjN58uRccsklX7iuUaNG5Z133smOO+7Y7HqSxo+nGTNm5KCDDsprr72Wo48+Oj169Mjvf//7DBo0KDfccEM22WST8rJzO35KpVIOO+ywPPPMMznyyCOzzjrrZNSoUbnkkkvyxhtv5Oyzzy4vu9566+X000/P9OnT8+abb2bo0KE54YQT8pvf/Gae1lPX66+/3uTLgW
u3X+uhhx7KlVdeWW+ZF198MWPHjs2xxx6bpZZaKpdeemn22Wef3HvvveWfLbM799xzM3369PLnn332WX70ox9l5ZV
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x1200 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
|
|||
|
" 1 1 1 1 1 1 1 1 1]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Построение дендрограммы (только для числовых данных)\n",
|
|||
|
"linkage_matrix = linkage(df_numerical_scaled, method='ward')\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(16, 12))\n",
|
|||
|
"dendrogram(linkage_matrix)\n",
|
|||
|
"plt.title('Дендрограмма агломеративной кластеризации (числовой признак \"city_latitude\")')\n",
|
|||
|
"plt.xlabel('Индекс образца')\n",
|
|||
|
"plt.ylabel('Расстояние')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Получение результатов кластеризации (только для числовых данных)\n",
|
|||
|
"result = fcluster(linkage_matrix, t=100, criterion='distance') \n",
|
|||
|
"print(result) # Вывод результатов кластеризации (номера кластеров для каждого образца)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 54,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABJ8AAAMQCAYAAACJzMTyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeZxVdf0/8NcdZgY3EFIURQHRRA1x+apppblVmuVXs0VTU8ufkksuqWlpufQ1U9wpl8xc0hZz/2b5zSWt3LfS3BIQFQVXQBSYhfv7A2ecgQHmDnOYGXg+Hw8enDn3vO9533vP3Ln3dT/nc0vlcrkcAAAAAChAVVc3AAAAAMCSS/gEAAAAQGGETwAAAAAURvgEAAAAQGGETwAAAAAURvgEAAAAQGGETwAAAAAURvgEAAAAQGGETwAAAAAURvgEQMX23Xff7Lvvvm1edvPNN2f48OE5/vjjF3NXQEeNGjUq1113XVe3QSf5wx/+kIMOOqir2wCAZsInADrNm2++mdNPP72r2wAqcMMNN2Ty5MnZY489uroVOskee+yRN954I3/4wx+6uhUASCJ8AqATnXrqqXn//fez3HLLdXUrQDvMnDkzo0ePzqhRo1JV5WXhkqJUKuXggw/OOeeck5kzZ3Z1OwAgfAKgc/zf//1fbr/99hx66KHp379/q8tmz56dSy+9NJ/5zGcyYsSIfO5zn8vVV1/dapt99903xx9/fC6++OJ84hOfyH/913/lkEMOycSJE1ttd8cdd+TrX/96Ntlkk4wYMSI77bRTrrnmmubLH3zwwQwfPjx///vfs/fee2fkyJH57Gc/m2uvvbZ5m5/85CcZPnx4HnjggeZ1N9xwQ4YPH56bbrqpuZ+5Ty08++yzM3z48Nxwww1JkuHDh+fCCy9stc2FF16Y4cOHz9Pzl770pWy44Yb55Cc/mR//+Md5//33W23zxBNP5Jvf/GY23XTTbLnlljn66KMzefLkVrfpwQcfTJI8//zz2XHHHbPnnnu2+35JkksvvTQ77LBDNthggwwfPrz539y3oaXjjz8+22+/ffPPl19+eTbZZJP89re/bXW/tfWv6X5Kkocffjjf+ta3svnmm2fEiBHZfvvtc+GFF2b27NnN20yfPj2nnXZatt5662y88cbZY4898te//rX58ZjfflreLwcffHA23XTTbLrppjn00EPz8ssvN19/e46NZN7HtVwuZ88998zw4cPzyiuvJElmzZqVk08+OVtttVU+/vGP55hjjsnUqVOba2bOnJmzzz47n/3sZzNixIhsuummOeCAA/LMM8/M975NkldeeaXVfTf3z0373mGHHVodZ//5z3+a79+5758Fuf766zNr1qxst912rdafe+65bd7Xcx8r1113XXbZZZeMGDEi2267bS688MI0NjZWdBuT5B//+Eeb+2v5O3j88cdn3333zR/+8Idst9122WSTTbLffvvl2WefbXX9L774Yr7zne/kk5/8ZDbeeOPsu+++efTRR+fZf8t/TT0OHz48v/71r/O9730vm2yyST7xiU/kf/7nfzJr1qzm+sbGxlx66aX5whe+kJEjR2bjjTfOnnvu2er5pOl5YJNNNkldXV2r/r7zne+0OjW5ZT+33HJLq23vvvvueR7H9uw/SbbbbrvMmjUr119/feZnQb+/LR/r119/PSeccEI+/elPZ+TIkfnyl7+cO++8c77X2/K6m35nmmy//fatTstu79+HuZ+P535eTBb+HABA16nu6gYA6PmmTJmSU045JR/72Mdy4IEH5ve//32ry08++eTccMMNOfjgg7PJJpvk4Ycfzumnn55p06bl0EMPbd7uzjvvTP/+/XPiiSdm9uzZOfvss7Pvvvvmj3/8Y5Zddtn89a9/zaGHHppvfOMbOfzwwzNz5sxce+21OfXUUzNixIhstNFGzdd11FFHZbfddsuoUaNy55135pRTTkmSfP3rX89RRx2Vv/71r/
nRj36UW2+9NW+++Wb+53/+JzvvvHN22223Nm/jSy+9lCuuuKLi++bWW2/NMcccky9+8Ys58sgjM3HixJx77rl54YUX8qtf/SqlUilPP/109tlnn2y00UY588wz09jYmLPPPjvf+ta3msOwls4666yMGDEi3/72t5OkXffLTTfdlLPPPjsHH3xwttpqqyy77LJJkq997Wvtvi2TJ0/OOeeck1NPPTWf/vSnW102ZsyYDBgwIEnyxhtv5LDDDmu+7Nlnn83++++fnXbaKeeee27K5XJuvfXWjBkzJsOGDcsuu+ySxsbGfPOb32wODoYNG5Ybb7wxhx56aK688sr86Ec/yvTp05t7/vKXv5yvfOUrSZJ11lkn48ePz5577plhw4blpz/9aRoaGnLRRRdlr732ys0335yVVlqpuZ8FHRttufnmm/P444/P8xjcdNNNOemkk9K3b9+ccsopOfnkk3PuuecmSY477rg88sgjOfroozN48OBMmDAh559/fr773e/mj3/8Y0qlUrvv97lddtll87yh//a3v53a2tqcdtppWWWVVVJVVZXrrrtuoadd3XLLLdl2221TW1vbav3MmTOz/fbb5+CDD25eN/excskll+Tcc8/NPvvskxNOOCHPPPNMLrzwwrz22msVn347c+bMDBw4MOeff37zuqbHpaVnnnkm48aNy9FHH50VV1wxF1xwQfbZZ5/cdtttWWWVVfLCCy/kq1/9aoYOHZoTTzwxNTU1ueqqq7Lffvvl8ssvzxZbbNHqPtt2222TpNXtP//887PRRhvlvPPOy9ixY3PeeefljTfeyHnnnZckGT16dH7zm9/ku9/9boYPH57JkyfnZz/7WY444oj89a9/bf7dSuaMQLr//vubf1/ee++93HPPPW2OMlt++eVz1113Zdddd21ed9ttt6WqqqpVSNve/ffu3Tvbbbddbr311uy9994LvP9b/v4mrR/rN998M1/+8pfTu3fvHHXUUenfv39uuOGGHHrooTnzzDNb9dsR7f37sDCVPAcAsPgJnwBYZKeffnqmTp2aX/7yl6mubv2nZfz48fn973+fo48+unkC3E996lMplUq55JJL8vWvf715pNSMGTNyww03ZM0110ySDBs2LLvvvntuuumm7LXXXnnhhRey++675wc/+EHz9W+yySb5+Mc/ngcffLBV+PSZz3ymebutt946r7/+en7+859nr732yjLLLJMzzjgjX//613PppZfmscceyworrNDmm92Wt/GjH/1o/v3vfzevq6qqSkNDw3xryuVyRo8ena233jqjR49uXj906NDsv//+ueeee7Ltttvm4osvTr9+/XL55Zend+/eSZJVVlkl3/3ud/Of//yn1XVOmDAhf//733PLLbfkox/9aJK0637517/+lX79+uXoo4+eb78L89vf/jbrrbdevvSlL81z2frrr5811lgjSeYJRp599tl84hOfyFlnndX8pvuTn/xk7rrrrjz44IPZZZddcu+99+af//xnfvazn2XHHXdMkmy55ZZ5+eWX88ADD7QKs5Jk4MCB2XjjjZt//tGPfpRll102V1xxRVZYYYUkyVZbbZUdd9wxl112Wb73ve81b7ugY2PuUOi9997L6NGj87GPfazVY18ul3Pcccc1z5P02GOPNU/YXVdXl/feey8nnnhiPv/5zydJtthii0yfPj1nnHFG3nzzzVZv9Cvx2muv5Re/+EWrft5+++28/PLLOemkk7LTTjs1b/u3v/1tgdc1ffr0PPnkk9l5553nuWzGjBlZffXVW93HLb377rv5+c9/nq997Ws58cQTk8z5ve7Xr19OPPHEHHDAAc3HZ3vMmDEjffv2bbW/psdx7v1efPHF2WyzzZIkI0eOzI477pirrroqxxxzTMaMGZPa2tpcddVVzfXbbrttvvCFL+TMM89sFcYNHjy4zdv3kY98JBdffHGqq6vz6U9/OlVVVfnJT36Sww8/PGuvvXZef/31HH
XUUa1G4vTu3TuHH354nnvuuVbXuc022+TOO+9sDp/uuuuuDBgwoFWY1HLbv/3tb6mrq0ttbW1mzZqVO++8M5tvvnm
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x800 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Выбираем подмножество данных для кластеризации\n",
|
|||
|
"features = df_encoded[['city_latitude']]\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование числовых признаков\n",
|
|||
|
"scaled_features = scaler.fit_transform(features)\n",
|
|||
|
"\n",
|
|||
|
"# Построение дендрограммы\n",
|
|||
|
"linkage_matrix = linkage(scaled_features, method='ward') \n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(12, 8))\n",
|
|||
|
"dendrogram(linkage_matrix, labels=df.index, leaf_rotation=90, leaf_font_size=10)\n",
|
|||
|
"plt.title('Иерархическая кластеризация (дендрограмма) по цене')\n",
|
|||
|
"plt.xlabel('Индекс товара')\n",
|
|||
|
"plt.ylabel('Евклидово расстояние')\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"**Визуализация распределения кластеров**"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 55,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAACbQAAAuoCAYAAAAwk66tAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzde/zX8+H///tb51SUpKZZmL0rpKSUldJo5kxt2CRWcsiaU2hrNMJSFqUoxzn7znk5b4zNIYcxxzArQiciIpV6//7w6/3xVlGpd8/ser1culze7+fr+Xy8Hq/nK5eHS5fb5fEsKSsrKwsAAAAAAAAAAACsZeut7QkAAAAAAAAAAABAImgDAAAAAAAAAACgIARtAAAAAAAAAAAAFIKgDQAAAAAAAAAAgEIQtAEAAAAAAAAAAFAIgjYAAAAAAAAAAAAKQdAGAAAAAAAAAABAIQjaAAAAAAAAAAAAKARBGwAAAAApKytb21MotyJzKdJ81zXu3erlfgIAAACsXoI2AAAAgP8xp556arp161b++9/+9reccsopq/19evXqlV69eq3UNU8//XT69etX/vtbb72V0tLS3HLLLUmSDz/8MCeffHKeeuqp1TLH0tLSjB49erWMtaY89thjOfbYY9O5c+dst912+fGPf5xhw4blvffeW+mxvnx/vw0mT56cIUOGZNddd02rVq3StWvXnHDCCZk0aVKF87p165ZTTz11tb73a6+9loMPPni1jgkAAADwv07QBgAAAPA/5phjjsmFF15Y/vuVV16ZadOmrcUZ/Z8///nPef3118t/b9SoUW688cZ07do1SfLyyy/n9ttvz+LFi9fSDCvXiBEjcvjhh6dq1ar57W9/m3HjxuXnP/95/vKXv+RnP/vZSn9vX76/67r77rsv+++/f1588cUcffTRueSSS3L88cdnypQp+dnPfpZHHnlkjb7/Pffck2eeeWaNvgcAAADA/5qqa3sCAAAAAFSuzTbbbG1PYYVVr149rVu3XtvTWCvuvPPOXHLJJRk0aFAOO+yw8uMdOnRIly5dsv/+++ess86qECf+L3nzzTdzyimnpHPnzjn//PNTpUqV8te6d++egw8+OKecckoeeOCBVK9efS3OFAAAAICVYYc2AAAAgG+ZsrKyXHnllfnJT36SVq1aZbfddstll12WsrKyJBUfOdqrV6888cQTeeKJJ1JaWppHH300nTp1yoknnrjUuN27d8/gwYNXeV6zZ8/O73//++yyyy7ZZptt0r59+/Tv3z9vvfVW+bxuvfXWvP322+WPGf3iI0cnTpyYQw89NEly6KGHlj/OdFmPkrzllltSWlpaPnaSPPHEEznwwAPLH9v56KOPLjXH+fPn59xzz02XLl2yzTbbZO+9985dd9213M80f/78tG3bNsOGDatw/LPPPkuHDh0ydOjQJMkLL7yQ3r17p23btmnTpk0OO+ywPPvss195v8aPH5/vf//76d2791KvNWvWLAMHDkybNm3Kv9dVub8r+pkXLlyYESNGZOedd06rVq3Sp0+f3HbbbUvd40ceeSQ///nP07Zt2+y444458cQTK+wid8stt6Rly5b585//nB/+8Idp3759rr322pSWlmby5MkV3vP2229PixYtlrsL3dVXX50FCxZk8ODBFWK2JKlVq1ZOOeWU9OjRI3PmzFnq2okTJ6a0tDQTJ06scPzLj8n9qu9t9OjR5THhFx9du3jx4owfPz677bZbttlmm/z4xz/O1VdfvdT7nHTSSRkwYEBat26dww8/PEkyYcKE7LPPPmnVqlU6dOiQk046KTNmzFjm5wcAAAD4thK0AQAAAHzLnHvuuTn33HPTrVu3XHzxxenZs2dGjBiR8ePHL3Xu6aefnpYtW6Zly5a58cYb06pVq+y3337561//mrlz55af9/TTT+eNN97IAQccsEpzKisry5FHHplHHnkkJ510Ui677LIce+yxeeyxx3L66a
cn+fxRqF26dMnGG29c4TGjS2y99dY57bTTkiSnnXZa+XUr4sUXX8wvf/nL1K1bN6NGjcqhhx6aE044Yak59u/fPzfccEMOP/zwXHTRRWnTpk2OP/743Hbbbcsct0aNGvnxj3+cu+++uzwsSz4Pu95///3su+++mTt3bvr27Zv69etn9OjRGTlyZObNm5c+ffrko48+Wua4s2bNyqRJk9K1a9eUlJQs85yf//zn6dOnT0pKSlb5/q7oZz7ttNPypz/9KYccckjGjBmThg0b5ne/+12F+dx222355S9/mSZNmuSPf/xjBg0alGeeeSYHHnhg3nvvvfLzFi1alMsvvzxnnXVWBg0alL322is1atTI7bffvtR4HTt2TJMmTZb5+f/xj3+kZcuW2WSTTZb5eseOHXP88cdn4403XubrX+frvref/vSn6dmzZ5LkxhtvzE9/+tMkyZAhQzJq1Kjss88+ufjii7P77rvn7LPPzpgxYyqMf/fdd2f99dfPRRddlL59++bpp5/OySefnO7du5fvzPf4448vMy4FAAAA+DbzyFEAAACAb5EPP/wwV111VQ455JAMHDgwSbLTTjtl1qxZefLJJ3PkkUdWOP/73/9+6tSpkyTlj/bs0aNHLrnkktx7773p0aNHks/jombNmmX77bdfpXnNnDmzfNesHXbYIUmy44475s0338yNN96Y5PNHoTZo0KDCY0Y/+eST8jHq1KmT73//++XzXvLzihg3blw22mijXHTRRalWrVqSpH79+jn++OPLz3n00Ufzj3/8IyNHjswee+yRJOncuXPmzZuXESNGZK+99krVqkv/c9q+++6bm2++OU8//XT5Z7vzzjuzxRZbZNttt82zzz6b999/P4ceemj5/dtiiy1y44035uOPP07dunWXGnPJrmRNmzZdoc+3qvf3kUce+drP/M477+TWW2/NKaecUr6TWOfOnfPuu+/mn//8Z5LPdyUbMWJEOnXqlPPOO698Xttvv3322GOPXHbZZTn55JPLjx911FEVgsXddtstd9xxR37961+npKQk06dPz+OPP57hw4cv9zNPnz49LVq0WKH7syr+85//fOX31rhx4zRu3DjJ//23M3ny5Py///f/csIJJ6Rfv35Jkk6dOqWkpCTjxo3Lz3/+89SvXz9JUq1atfz+978vfxzq+PHjU7NmzfTr16/82IYbbpjnn38+ZWVlyw0bAQAAAL5t7NAGAAAA8C3y7LPP5rPPPkv37t0rHB88eHAuvfTSFRpj8803T9u2bct3zPr0009z9913r/LubEmyySab5Kqrrkrbtm3z1ltv5ZFHHsnVV1+df/3rX1mwYMEqj7uinn766XTu3Lk8Zks+f4TqFx9V+dhjj6WkpCRdunTJZ599Vv6nW7dumTVrVl577bVljt2+fft85zvfyZ133pnk80d4/vWvf82+++6bJNlqq63SoEGDHHXUUTnttNNy//33p2HDhhk4cGB5EPVlS8K5xYsXr9DnW9X7uyKfeeLEiSkrK8vuu+9e4dq99tqr/OfJkydn1qxZFY4ln0d0bdq0yRNPPFHh+JdDtJ49e+btt9/OU089leTzgHL99dfPbrvttty5V6lSJYsWLfrqG/MNrMr39vjjj6esrCzdunVb6n7Onz8/Tz/9dPm5W2yxRXm4liTt2rXLvHnzstdee+W8887LU089lU6dOuXYY48VswEAAAD/U+zQBgAAAPAt8sEHHyRJGjRo8I3G6dmzZ37zm99k2rRpefrpp/Pxxx9nv/32+0Zj3nHHHfnjH/+YadOmZcMNN0yLFi1Ss2bNbzTmipozZ075zlhLVK1atcKxDz74IGVlZcvdhW7mzJnL3BGspKQke++9d/785z9n8ODBefDBB/PJJ59k7733TpKsv/76ufbaa3PRRRfl7rvvzo033piaNWtm3333zeDBgytETUs0adIkJSUlefvtt7/yM1WtWjXrr79+klW7vyvymWfPnp0k2WijjS
q89sXfl/y9a9iw4VJjNGzYMC+99FKFY7Vr167we4cOHdK0adPcdtttadeuXW677bbsscceqVGjxnLn/p3vfCfvvPP
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 2500x3000 with 4 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Кодирование категориальных переменных (one-hot)\n",
|
|||
|
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# Выбор подмножества данных для кластеризации\n",
|
|||
|
"features = df_encoded[['city_latitude']]\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование данных\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"scaled_features = scaler.fit_transform(features)\n",
|
|||
|
"\n",
|
|||
|
"# Кластеризация данных\n",
|
|||
|
"kmeans = KMeans(n_clusters=3)\n",
|
|||
|
"df_encoded['Cluster'] = kmeans.fit_predict(scaled_features)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация кластеров\n",
|
|||
|
"plt.figure(figsize=(25, 30))\n",
|
|||
|
"\n",
|
|||
|
"# Парный график 1: city_latitude vs Category\n",
|
|||
|
"category_columns = [col for col in df_encoded.columns if col.startswith('category_')]\n",
|
|||
|
"if category_columns:\n",
|
|||
|
" plt.subplot(4, 1, 1)\n",
|
|||
|
" sns.scatterplot(x=df_encoded['city_latitude'], y=df_encoded[category_columns[0]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
|
|||
|
" plt.title('city_latitude vs Category Clusters')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel(f'Category ({category_columns[0]})')\n",
|
|||
|
"else:\n",
|
|||
|
" plt.subplot(4, 1, 1)\n",
|
|||
|
" plt.text(0.5, 0.5, 'No category columns found', ha='center', va='center', fontsize=12)\n",
|
|||
|
" plt.title('city_latitude vs Category Clusters')\n",
|
|||
|
"\n",
|
|||
|
"# Парный график 2: city_latitude vs Sub-Category\n",
|
|||
|
"# Парный график 2: city_latitude vs первый столбец с префиксом 'city_' (в подписях назван Sub-Category; под префикс попадает и сам city_latitude)\n",
|
|||
|
"if city_columns:\n",
|
|||
|
" plt.subplot(4, 1, 2)\n",
|
|||
|
" sns.scatterplot(x=df_encoded['city_latitude'], y=df_encoded[city_columns[0]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
|
|||
|
" plt.title('city_latitude vs Sub-Category Clusters')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel(f'Sub-Category ({city_columns[0]})')\n",
|
|||
|
"else:\n",
|
|||
|
" plt.subplot(4, 1, 2)\n",
|
|||
|
" plt.text(0.5, 0.5, 'No sub-category columns found', ha='center', va='center', fontsize=12)\n",
|
|||
|
" plt.title('city_latitude vs Sub-Category Clusters')\n",
|
|||
|
"\n",
|
|||
|
"# Парный график 3: city_latitude vs Category (другая категория)\n",
|
|||
|
"if len(category_columns) > 1:\n",
|
|||
|
" plt.subplot(4, 1, 3)\n",
|
|||
|
" sns.scatterplot(x=df_encoded['city_latitude'], y=df_encoded[category_columns[1]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
|
|||
|
" plt.title('city_latitude vs Category Clusters')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel(f'Category ({category_columns[1]})')\n",
|
|||
|
"else:\n",
|
|||
|
" plt.subplot(4, 1, 3)\n",
|
|||
|
" plt.text(0.5, 0.5, 'Not enough category columns found', ha='center', va='center', fontsize=12)\n",
|
|||
|
" plt.title('city_latitude vs Category Clusters')\n",
|
|||
|
"\n",
|
|||
|
"# Парный график 4: city_latitude vs второй столбец с префиксом 'city_' (в подписях назван Sub-Category)\n",
|
|||
|
"if len(city_columns) > 1:\n",
|
|||
|
" plt.subplot(4, 1, 4)\n",
|
|||
|
" sns.scatterplot(x=df_encoded['city_latitude'], y=df_encoded[city_columns[1]], hue=df_encoded['Cluster'], palette='Set1', alpha=0.6)\n",
|
|||
|
" plt.title('city_latitude vs Sub-Category Clusters')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel(f'Sub-Category ({city_columns[1]})')\n",
|
|||
|
"else:\n",
|
|||
|
" plt.subplot(4, 1, 4)\n",
|
|||
|
" plt.text(0.5, 0.5, 'Not enough sub-category columns found', ha='center', va='center', fontsize=12)\n",
|
|||
|
" plt.title('city_latitude vs Sub-Category Clusters')\n",
|
|||
|
"\n",
|
|||
|
"# Настройка графиков\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## KMeans (неиерархическая кластеризация) для сравнения"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 56,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Центры кластеров:\n",
|
|||
|
" [[48.25734972]\n",
|
|||
|
" [32.43819138]\n",
|
|||
|
" [37.9835445 ]\n",
|
|||
|
" [43.07246158]]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAASgCAYAAABWngGUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gUVdvH8d/uJpseEkoSek/oTQgoUkVQBBSwYwELKigqFrADFvSxgDQFBAs2RB7AgmAXG10EpPeahJK+ydZ5/+DJvoYESIDsJuH7uS4ukpmzM/c5md3MyT3nHJNhGIYAAAAAAAAAAABKEbO/AwAAAAAAAAAAADgZCQwAAAAAAAAAAFDqkMAAAAAAAAAAAAClDgkMAAAAAAAAAABQ6pDAAAAAAAAAAAAApQ4JDAAAAAAAAAAAUOqQwAAAAAAAAAAAAKUOCQwAAAAAAAAAAFDqkMAAcEEwDMPfIfhNaar7mWIpTbGWRbRf6cbPBwAAnC8X8n1Faao7/ZsLBz9LwH9IYAAl5NZbb9Wtt95aYHtWVpauv/56NWvWTN9//723bEJCgm688cZTHu/hhx9WQkKCRo8eXWIxlxS73a733ntPAwcO1EUXXaTExETdeOONWrhwYb6bgMmTJyshIeG8ntvhcOill17Sl19+eV6Od6qfa2mRkJCgyZMnSzr/dc9z4MABJSQk6L///W+xXjdt2jTNmjXL+/3JP+81a9Zo6NCh5yXG//73v0pISNCBAwfOy/FKQlHfF0V1cvuWB/fee6/mzZsn6f+vl/fee6/QsqNHj1b37t3P+ZyjR49WQkKC91+jRo3UqlUr9e3bV1OmTFFubm6xj5mUlKShQ4fq4MGDxXrdm2++qTFjxhT7fAAAlAT6N/+P/o3v0L85obT3b1wul9577z31799frVq1UuvWrdW/f3/Nnj1bDoej2Mf798/9bHg8Hs2bN0+DBg1S+/bt1aZNG/Xv319z5sw5q3jmzZunV1555azjAXBuAvwdAHAhycrK0l133aUtW7Zo6tSp6tKli3ef2WzWunXrlJSUpLi4uHyvs9ls+umnn3wd7nlx9OhR3XXXXTp8+LBuvfVWtWjRQh6PRz/99JNGjx6t1atX6/nnn5fJZCqR86ekpOj999/X+PHjz8vxnnvuufNynJIyd+5c7/Vzvut+rt58803df//93u+vu+46derUyfv9vHnztHPnTn+E5nMl8b44uX3Luv/+979KTk7WwIED822fMGGCunXrptq1a5fYuatUqaIpU6ZIOtH5yczM1OrVqzV9+nT99ttvev/99xUUFFTk4/3xxx/65Zdfih3H0KFD1atXL/Xq1UsXX3xxsV8PAEBJo39D/6ak0b8pG5555hl9++23Gjp0qJo1ayaPx6PVq1dr4sSJWrNmjaZOneqzWHJycnTvvffq77//1k033aS77rpLgYGBWr58uf7zn/9o2bJlmjp1qqxWa5GP+dZbbykxMbEEowZwOiQwAB/Ju7nfvHmz3nrrLXXs2DHf/iZNmmjHjh1asmSJBg8enG/fTz/9pJCQEEVGRvow4vNj1KhRSkpK0ty5c1WnTh3v9q5du6patWp644031K1bN1122WX+C7IYGjRo4O8QTqtVq1b+DqHI4uLiCnRmLxTl7X1xvuXm5uq1117Tc889J7M5/2BRq9WqJ598Uh9++GGJ/WHAarUWeC916dJFLVu21PDhwzV79mzdd999JXLufwsJCdHtt9+u8ePH64svvijx8wEAUBz0b8rHfRz9m/PnQu3fHDp0SAsWLNC4ceN0/fXXe7d36tRJFStW1EsvvaT169erRYsWPoln/PjxWrt2rebMmZPv+rn00kvVqFEjPfLII/r000912223+SQeAOeOKaQAH8jOztbdd9+trVu3asaMGQVu7iUpNDRUXbp00ZIlSwrsW7x4sXr16qWAgPw5R4/HoxkzZujyyy9Xs2bN1KtXL82ZMydfGbfbrR
kzZqhPnz5q0aKFWrVqpRtvvFHLly/3lpk8ebIuv/xy/fzzz+rbt6/3WAsXLsx3rPfff19XXHGFmjdvrk6dOmnMmDHKyso6Zb03b96s3377TXfeeWe+m/s8gwcP1qBBgxQaGlro67t3715gSPnJQ2dzc3M1ZswYde7cWc2aNdMVV1zhHcZ74MABb8fhiSeeyDe9zOrVq3XLLbeoZcuWSkxM1KhRo3T8+PF852nSpInmzZunjh07KjExUTt27CgwxDohIUEfffSRnnrqKSUmJqp169Z68MEHdfTo0Xxxz5o1S5dddplatGihG2+8UT/++KMSEhK0YsUKb6xFGSabkpKiUaNG6eKLL1br1q11yy236K+//soXz+TJkwut+88//6yEhAT99ttv+Y65evVqJSQkaM2aNac99+msWrVKd955p9q1a6dmzZqpe/fumjx5sjwejzcuSZoyZYr3638PsR49erQWLFiggwcPeodvr1ixIl8b5Tn5Z+DxeDRt2jR17dpVLVu21LBhw5Senl4gxm3btumee+5RmzZt1KZNGw0fPlz79+8/ZZ2+/PJLJSQkaNu2bfm2f//990pISNCmTZsk+eZ9cTbtW9Q679y5U3fffbfatGmjSy65RBMmTNATTzyRr43tdrumTp3qrWfPnj01Y8YM7/mlEz+XRx99VCNGjFCrVq00ZMgQDRw4sNCpIwYPHqwhQ4acso3mz58vu92ubt26FdiX92TjBx98cMrX5/n99991880366KLLlL79u31yCOP6PDhw2d83an06NFDrVq10qeffurddqbP2P/+97964oknJEmXXXaZ9zMtNzdXr7/+unr27KlmzZqpTZs2GjJkiDZv3pzvnH369NH27dv1888/n3XcAACcb/Rv6N/Qv6F/k+fo0aMyDCNf3yBP3759NXLkSG+y8lRTYRX23sjKytKjjz6q1q1b6+KLL9YLL7ygnJycU8YhScePH9f8+fM1cODAQpNfffr00R133KHY2Fjvti1btuj+++9Xhw4d1LRpU3Xq1EkvvPCCd+rY7t276+DBg1qwYEG+2A8dOqSRI0cqMTFRLVu21O233+5twzwpKSl6+OGHlZiYqHbt2unZZ5/VhAkT8r133W63PvroI/Xt21ctWrRQ165d9dprr8lut3vLjB49Wrfffruee+45tWnTRr1799aIESPUuXPnAu3+1FNPqVevXqdtJ6CsIYEBlDCbzaahQ4dq06ZNmjlzptq3b3/Ksr179/YOs86TlZWlZcuWqU+fPgXKjxkzRpMmTVK/fv309ttv64orrtBLL72Ub3jma6+9pmnTpumGG27QO++8o+eff15paWl68MEH8/3yP3LkiMaNG6fbbrtNM2bMUI0aNTRq1CjvkNevvvpKr776qgYNGqRZs2Zp+PDhWrRokZ5//vlT1ufXX3+VpFPOSx8UFKRnn332nKZGeemll7Rs2TKNGjXKexP9n//8R/Pnz1dMTIx3Gpj77rvP+/WqVas0ePBgBQcHa+LEiXryySe1cuVK3Xbbbfnmt3e73Zo9e7ZefPFFPfHEE6pfv36hMUyYMEEej0dvvPGGHn/8cf3000966aWXvPunTJmi1157TVdeeaWmTZumli1b6qGHHsp3jJiYGM2dO1fXXXfdKeuanZ2tm266SStWrNBjjz2mKVOmKCgoSHfccYf27NlT4Hgn171Tp06KiYnRokWL8pVduHCh6tSpo4suuuj0jX0KW7Zs0eDBgxUVFaUJEyborbfeUtu2bTVlyhR98803kk4M/Zaka6+91vv1vw0bNkxdunRRlSpVNHfuXHXt2rXI53/11Vc1depUXXvttZoyZYqioqL0+uuv5yuze/du3XjjjTp27JheeeUVvfjii9q/f79uuukmHTt2rNDj9ujRQ6Ghofr666/zbf/qq6/UsGFDNWnSxCfvi7Nt36LU+fjx47rlllt0+PBhjR8/Xk8//bSWLFmir776yhuPYRi699579c477+
i6667zftZMnDixwJQD33zzjcLCwvTWW2/prrvu0rXXXqu//vpLe/fu9ZY5fPiwVqxYoQEDBpyyjb744gt17dq10GH
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x1200 with 4 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"from sklearn.cluster import KMeans\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"df = pd.read_csv(\"nuforc_reports.csv\")\n",
|
|||
|
"df = df.iloc[:1000].dropna()\n",
|
|||
|
"\n",
|
|||
|
"# Удаление несущественных столбцов\n",
|
|||
|
"columns_to_drop = ['summary', 'stats', 'report_link', 'posted', \"duration\"]\n",
|
|||
|
"df_cleaned = df.drop(columns=columns_to_drop)\n",
|
|||
|
"\n",
|
|||
|
"# Кодирование категориальных переменных (one-hot)\n",
|
|||
|
"df_encoded = pd.get_dummies(df_cleaned, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# Выбор подмножества данных для кластеризации\n",
|
|||
|
"features_used = ['city_latitude']\n",
|
|||
|
"data_to_scale = df_encoded[features_used]\n",
|
|||
|
"\n",
|
|||
|
"# Масштабирование данных\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"data_scaled = scaler.fit_transform(data_to_scale)\n",
|
|||
|
"\n",
|
|||
|
"# Кластеризация данных\n",
|
|||
|
"random_state = 42\n",
|
|||
|
"kmeans = KMeans(n_clusters=4, random_state=random_state)\n",
|
|||
|
"labels = kmeans.fit_predict(data_scaled)\n",
|
|||
|
"centers = kmeans.cluster_centers_\n",
|
|||
|
"\n",
|
|||
|
"# Отображение центроидов\n",
|
|||
|
"centers_original = scaler.inverse_transform(centers) # Обратная стандартизация\n",
|
|||
|
"print(\"Центры кластеров:\\n\", centers_original)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация результатов кластеризации KMeans\n",
|
|||
|
"plt.figure(figsize=(16, 12))\n",
|
|||
|
"\n",
|
|||
|
"# Парный график 1: city_latitude vs Category\n",
|
|||
|
"plt.subplot(2, 2, 1)\n",
|
|||
|
"category_columns = [col for col in df_encoded.columns if col.startswith('category_')]\n",
|
|||
|
"if category_columns:\n",
|
|||
|
" sns.scatterplot(x=df_cleaned['city_latitude'], y=df_encoded[category_columns[0]], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
" plt.title('KMeans Clustering: city_latitude vs Category')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel(f'Category ({category_columns[0]})')\n",
|
|||
|
"else:\n",
|
|||
|
" plt.title('KMeans Clustering: city_latitude vs Category (No Data)')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel('state')\n",
|
|||
|
"\n",
|
|||
|
"# Парный график 2: city_latitude vs Sub-Category\n",
|
|||
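|
"# Парный график 2: city_latitude vs первый столбец с префиксом 'city_' (в подписях назван Sub-Category; под префикс попадает и сам city_latitude)\n",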
|
|||
|
"city_columns = [col for col in df_encoded.columns if col.startswith('city_')]\n",
|
|||
|
"if city_columns:\n",
|
|||
|
" sns.scatterplot(x=df_cleaned['city_latitude'], y=df_encoded[city_columns[0]], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
" plt.title('KMeans Clustering: city_latitude vs Sub-Category')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel(f'Sub-Category ({city_columns[0]})')\n",
|
|||
|
"else:\n",
|
|||
|
" plt.title('KMeans Clustering: city_latitude vs Sub-Category (No Data)')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel('Sub-Category')\n",
|
|||
|
"\n",
|
|||
|
"# Парный график 3: city_latitude vs Category (другая категория)\n",
|
|||
|
"plt.subplot(2, 2, 3)\n",
|
|||
|
"if len(category_columns) > 1:\n",
|
|||
|
" sns.scatterplot(x=df_cleaned['city_latitude'], y=df_encoded[category_columns[1]], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
" plt.title('KMeans Clustering: city_latitude vs Category')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel(f'Category ({category_columns[1]})')\n",
|
|||
|
"else:\n",
|
|||
|
" plt.title('KMeans Clustering: city_latitude vs Category (No Data)')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel('state')\n",
|
|||
|
"\n",
|
|||
|
"# Парный график 4: city_latitude vs второй столбец с префиксом 'city_' (в подписях назван Sub-Category)\n",
|
|||
|
"plt.subplot(2, 2, 4)\n",
|
|||
|
"if len(city_columns) > 1:\n",
|
|||
|
" sns.scatterplot(x=df_cleaned['city_latitude'], y=df_encoded[city_columns[1]], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
" plt.title('KMeans Clustering: city_latitude vs Sub-Category')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel(f'Sub-Category ({city_columns[1]})')\n",
|
|||
|
"else:\n",
|
|||
|
" plt.title('KMeans Clustering: city_latitude vs Sub-Category (No Data)')\n",
|
|||
|
" plt.xlabel('city_latitude')\n",
|
|||
|
" plt.ylabel('Sub-Category')\n",
|
|||
|
"\n",
|
|||
|
"# Настройка графиков\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### PCA для визуализации данных в пространстве пониженной размерности"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 57,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABjAAAAJICAYAAADPWa1BAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3zU9f0H8Nd33ExCEgKZEAIEwt57KKCiIoqziohIHSi2tlpbtSpi/TnauurAaqvVKhZUFBRRBEVBRfbeI0AYGYyQcfM7fn8cObjkLmTcTF7Px8NHy33u7vu+T+6S7+s+38/nI+i6roOIiIiIiIiIiIiIiCiKiJEugIiIiIiIiIiIiIiIqDoOYBARERERERERERERUdThAAYREREREREREREREUUdDmAQEREREREREREREVHU4QAGERERERERERERERFFHQ5gEBERERERERERERFR1OEABhERERERERERERERRR0OYBARERERERERERERUdThAAYRUTOi63qkS6AAmvPPpjm/diIiIiKqieeHzQN/zkRUFxzAICJMnjwZeXl5Pv/16NEDo0aNwpNPPonTp0/XeEx+fj5mzpyJiy++GL169cKoUaPwwAMPYOfOnQGP89JLLyEvLw9PPfVUKF9OQK+++iry8vIicmx/Pv30U+Tl5eHw4cMhf5zL5cIzzzyDL774or5l1stNN92EvLw8LF68OKTHibafZWOUlZXhT3/6E9auXeu9bfLkyZg8eXLYaqjr53nMmDF4+OGHg3rsPXv2YOLEiUF5rsOHDyMvLw+ffvppUJ6PiIiIogtzS2Q0pdySl5eHV199tcbtu3fvxtChQ3HhhRfiwIED3vvm5eXhxRdf9PtcmqZh5MiRMXv+WVRUhL/97W+47LLL0Lt3b4wYMQJ33323Ty4BQpNNCgsLcdddd+HIkSNBeb5AP1ciaho4gEFEAIBu3bph7ty53v/+85//4LbbbsO8efMwbdo0nysjvvnmG1xzzTXYtm0b7rnnHvzrX//C/fffjwMHDuBXv/oVfvrppxrPr2ka5s+fj86dO2PBggWw2+3hfHnNXnFxMd577z0oihKyY+zfvx8bNmxA586dMWfOnJAdp6nZsWMHFixYAE3TvLc98cQTeOKJJ8Jy/IZ8noPp66+/xoYNG4LyXKmpqZg7dy5GjRoVlOcjIiKi6MPc0rSFI7dUt2fPHtx2222wWCz44IMPkJOT420TRRFff/2138etWbMGxcXFYaoyuNatW4cJEyZg2bJluPXWW/HPf/4Tjz76KBwOByZPnoz58+eH9Pg///wzfvjhh6A939y5c3HDDTcE7fmIKLrIkS6AiKJDfHw8+vTp43PbwIEDUVlZiVdeeQWbNm1Cnz59cOjQITz00EMYOXIkXn75ZUiS5L3/2LFjMXHiRDz00EP47rvvYDQavW0//vgjCgsL8eKLL+KWW27BwoULeYLRxHz66afIysrCtGnT8OCDD+LgwYNo165dpMuKSbm5uWE5TkM/z9HKaDTW+D1GRERETQtzCwXTvn37MGXKFMTFxeG9995DZmamT3u/fv2wdu1abN++Hd26dfNp+/LLL9G1a1fs2LEjnCU3WmlpKX7/+98jJycH//nPf2CxWLxtl156Ke666y7MmDEDI0aMQKtWrSJYad0xAxA1bZyBQUS16tGjBwDg6NGjAID3338fLpcLjz32mE8IAACLxYKHHnoI1113XY3p2/PmzUPnzp3Rv39/DB48GHPnzj3vsceMGYNnnnkGU6ZMQa9evfDoo48C8JxwzZgxA8OGDUPPnj3xq1/9CitXrvR5rNPpxLPPPovhw4ejb9++eOSRR+B0On3u428q7KpVq5CXl4dVq1Z5b9u/fz9+85vfYNCgQRg4cCCmTZuGffv2+Rzrb3/7Gy688EL06NEDV155JRYtWuTzvJqmYdasWRg1ahR69+6N6dOn+53iXl1dH7d06VLcfPPN6Nu3L3
r06IHLLrsMs2fPBuBZVueiiy4CADzyyCMYM2aM93Eff/wxrr32WvTp0we9evXChAkT8NVXX/k8d15e3nmXDVJVFfPnz8fo0aNx8cUXw2q1+v0Zu91uPP/887jgggvQq1cv3H777Zg/f36NqeWfffYZxo0bh549e+Kqq67CypUr0a1bt1qnZi9atAjXXnst+vbti+HDh2PGjBk+ffXqq6/isssuw5IlSzB+/Hj07NkTEyZMwIYNG7Bx40bccMMN6NWrF8aPH1/j/bR7925MmzYN/fr1Q79+/XDvvfeioKDA2171vpkzZw5Gjx6Nfv36ea/oq62PV61ahVtvvRUAcOutt3rfj+e+N3/961/j2muvrfF6p0+fjquuusr777Vr1+KWW25B7969MWjQIDz00EM4efJkwP4CGv55Pvc1n/tZqV47AGzduhVTpkxB//790bdvX9x2223YuHEjAM/P5LXXXgPgO+1b0zS89dZbuOSSS9CjRw9ceumleP/992sc58EHH8R9992HPn36YOrUqTWWkPr000/RrVs3bNq0CTfeeCN69uyJ0aNH4+233/Z5ruLiYtx///3ez/iMGTPw0ksv+XxWiIiIKLoxtzC31CW3nGvfvn249dZbkZCQgA8++KDG4AXgGRxr1apVjVkYiqLgm2++wRVXXFHjMXX5uZ88eRJPPvkkRo8ejR49emDQoEG49957fTLR5MmT8eijj+Ktt97CqFGj0LNnT9x0003YvHmz9z4OhwMzZ87EBRdc4O3P6ue61c2fPx/FxcX485//7DN4AXhmnDz44IOYNGkSKioqajw20JKtDz/8sM/P69ChQ7j77rsxePBg9O7dGzfeeKN3xsWnn36KRx55BABw0UUX+fzMPv74Y1xxxRXepeFeffVVqKrqc5wpU6bgiSeeQL9+/TBu3DioquqTJao+GytXrsSvf/1r9O7dG8OHD8ff//53n+eqqKjAjBkzMHToUPTt2xf3338/3n333ahavo2IPDiAQUS1ys/PBwC0bdsWALBixQp069YNaWlpfu8/dOhQ3H///WjdurX3ttLSUnz33Xe4+uqrAQDXXHMNtmzZgm3btp33+LNnz0bPnj0xa9YsXH/99XA6nZgyZQq+/fZb3H///XjttdeQnp6OO+64w+ek8I9//CM++ugjTJs2DS+//DJOnz6Nd999t96vv6ioCDfeeCMOHDiAmTNn4u9//zuOHz+OKVOmoLS0FLqu495778WcOXMwdepUvPHGG96Tn3On3f7973/H66+/juuvvx6vvfYakpKS8MILL5z3+HV53Pfff497770X3bt3x6xZs/Dqq6+ibdu2+Mtf/oJNmzYhNTXV+yXxPffc4/3/s2fPxowZM3DxxRfjzTffxPPPPw+j0YgHH3wQhYWF3uefO3cupk+fXmudy5cvR0lJCa6++mqYzWZcfvnl+Oyzz+ByuXzuN2PGDLz33nu45ZZb8Prrr6NVq1Z4/PHHfe4zf/58PPzww+jXrx9mzZqFSy+9FNOnT/c52axu1qxZeOCBB9CnTx+88soruPfee7F48WJMnjwZDofDe7/CwkI899xzuPvuu/GPf/wDZWVluO+++/DAAw/ghhtuwOuvvw5d13H//fd7H5efn4+bbroJJ06cwF//+lc8/fTTKCgowMSJE3HixAmfOl577TU89NBDmDFjBvr27XvePu7evTtmzJjh7Rt/y0ZdddVV2LZtGw4ePOi9raysDMuXL8eECRMAeKav33bbbTCbzXj55Zfx5z//GatXr8att97q8/qra8jnuT4qKipwxx13IDk5Ga+++ipeeukl2O123H777SgvL8cNN9yA66+/HoDvtO+ZM2filVdewVVXXYV//vOfuOyyy/DMM8/g9ddf93n+r776CnFxcXjjjTdwxx13+K1B0zT8/ve/x7hx4/DWW2+hX79++Nvf/oYVK1YA8KyzPGXKFKxfvx5//vOf8eyzz2Lnzp145513GvSaiYiIKD
KYW5hb6pJbquzfvx9TpkxBfHw8Pvjgg4DvE0mScOmll9YYwFi5ciWcTmeNC17q8nPXdR3Tpk3DTz/9hAcffBBvv/0
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1600x600 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Применение PCA только к масштабированному числовому признаку (data_scaled содержит один столбец — city_latitude, поэтому компонента одна)\n",
|
|||
|
"pca = PCA(n_components=1)\n",
|
|||
|
"reduced_data = pca.fit_transform(data_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация сокращенных данных\n",
|
|||
|
"plt.figure(figsize=(16, 6))\n",
|
|||
|
"\n",
|
|||
|
"# График 1: PCA reduced data: Agglomerative Clustering\n",
|
|||
|
"plt.subplot(1, 2, 1)\n",
|
|||
|
"sns.scatterplot(x=range(len(reduced_data)), y=reduced_data[:, 0], hue=result, palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.title('PCA reduced data: Agglomerative Clustering')\n",
|
|||
|
"plt.xlabel('Sample Index')\n",
|
|||
|
"plt.ylabel('Principal Component 1')\n",
|
|||
|
"\n",
|
|||
|
"# График 2: PCA reduced data: KMeans Clustering\n",
|
|||
|
"plt.subplot(1, 2, 2)\n",
|
|||
|
"sns.scatterplot(x=range(len(reduced_data)), y=reduced_data[:, 0], hue=labels, palette='Set1', alpha=0.6)\n",
|
|||
|
"plt.title('PCA reduced data: KMeans Clustering')\n",
|
|||
|
"plt.xlabel('Sample Index')\n",
|
|||
|
"plt.ylabel('Principal Component 1')\n",
|
|||
|
"\n",
|
|||
|
"plt.tight_layout()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Анализ инерции для метода локтя (метод оценки суммы квадратов расстояний)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 58,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1YAAAImCAYAAABQCRseAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB5nElEQVR4nO3dd3hUVf7H8c/MZNILpIdeQqgpIAFxQRH5sRZ0RSyrYkFQFnFZRUVR1hV3LYsUKaIiiBVBBVGXVddeEBCUnkAIEKSF9EZ6Mr8/QkaGUEIS5iaT9+t58iRz77l3vndy1s2Hc+65JpvNZhMAAAAAoM7MRhcAAAAAAE0dwQoAAAAA6olgBQAAAAD1RLACAAAAgHoiWAEAAABAPRGsAAAAAKCeCFYAAAAAUE8EKwAAAACoJ4IVAAAAANQTwQoAAAAA6olgBQBncdttt6lr167685//fNo2DzzwgLp27apHH33UiZUBqKuDBw+qa9euWrlypdGlAHARBCsAqAWz2azNmzcrNTW1xr7CwkJ98803BlQFAAAaC4IVANRCjx495OHhoc8++6zGvm+++UZeXl4KCwszoDIAANAYEKwAoBa8vb11ySWXnDJY/fe//9Uf//hHubm51dj35Zdf6rrrrlN0dLT+8Ic/6F//+pcKCwslSUOGDFHXrl1P+XXw4EFJ0po1a3TLLbfoggsuUP/+/fXggw/qyJEjDu/x4IMPnvIcZ5viVD3F8VRfJ9q2bZvGjBmj/v37q0+fPvrLX/6i3bt32/evX79eXbt21fr16yVJSUlJGjp0qP785z9r3rx5p32PefPmSZLef/99XXHFFerVq5fD/rNNq3zvvfdOed4Tj6ue7nW2dnWtobafzZne/3T7q38Pjz76qIYMGeLwvsuWLXP4DE98n19++cWh7dtvv62uXbs6nKO4uFgzZ87UsGHD1KtXL/Xp00ejR49WYmKiw7Gnq+u2225zaFNdx6mc3D+q3XbbbQ7nKSkp0YsvvqjLL79c0dHRGjZsmBYuXKjKykqHY06uZf369bU69mxsNpumTJmimJgY/fjjj7U+DgCq1fwrAABwSldeeaXuv/9+paamKjw8XJJUUFCg77//XkuWLNH333/v0P6TTz7RQw89pKuvvlr333+/Dh06pNmzZys5OVlLlizR/PnzVVpaqvT0dN13330aP368Bg8eLEkKDQ3VqlWr9Mgjj2j48OEaN26csrOzNXfuXN1000368MMPFRQUJKnqD9KbbrpJ1113nSTZz1cbPXr00D/+8Q/76/fff18ffPCB/fW6des0duxY9e/fX88884xKSkr0yiuv6M9//rPee+89de7cucY5n3/+efXq1Uvjx49XQECABg0aJEmaNm2aJNnfLzw8XBs2bNDUqVN1/fXXa+rUqfLx8ZGkWtVfXFys6OhoTZ061b7tdMed+Nme3K6uNZzLZ/PEE0+oZ8+ep3z/5cuXS5J27Nihp556qkbbk+Xm5uqFF1445T4fHx99/fXXuuCCC+zb/vvf/8psdvx31MmTJ2vjxo2aNGmS2rVrp/3792vOnDl68MEHtXr1aplMJnvb66+/XjfccIP9dfXvsSHZbDb95S9/0ebNm3XfffepW7duWr9+vV544QUdOHBA//znP+1tT+6znTt3rvWxZ/Kvf/1L//nPf/Tiiy9q4MCBDX6NAFwfwQoAamnw4MHy8vLSZ599pjvvvFOS9MUXXygoKMjhD1mp6g/FGTNmaNCgQZoxY4Z9e4cOHXTnnXfqu+++s/+hXz061a5dO8XFxUmSKisrNWPGDA0cOFAzZ860H9+nTx9deeWVWrx4sSZPnixJKioqUocOHezHVp+vNnx9fe3HSdIPP/zgsH/mzJlq3769Fi5cKIvFIkkaOHCg/u///k9z587VnDlzHNrv379fP/74oz7++GN16dJFkuwh1NfXV5Ic3m/16tWSpMcee8weaCTJ3d39rLUXFRUpODjY4X
ynO+7Ez/bkdlu3bq1TDefy2URGRp72/au3l5SUnLLtyebOnatWrVopOzu7xr6LL75YX331lR5++GFJUmpqqjZt2qS+ffvq0KFDkqTS0lIdO3ZMU6dO1ZVXXilJ6tevnwoKCvTcc88pIyNDISEh9nOGh4c71FP9e2xI33//vX766SfNmjVLV111lSTpD3/4gzw9PTVnzhzdfvvt9v50cp/97rvvan3s6cycOVPLly/X/PnzdfHFFzf49QFoHpgKCAC15OnpqSFDhjhMB1y9erWuuOIKh3/hl6S9e/cqNTVVQ4YMUXl5uf0rPj5evr6+WrNmzRnfa9++fUpPT9fw4cMdtrdr1069e/fWzz//bN925MgR+fn5NcAVOiosLNS2bdt0xRVX2IODJPn7++vSSy91qKG6/ezZs9W/f/+z/iFbLSYmRpL02muvKS0tTaWlpSovL6/VsQ113XWp4Vw/m4aSlJSk5cuX6+9///sp9w8ZMkQpKSnau3evJOmzzz5TbGysWrdubW/j7u6uxYsX68orr9TRo0e1bt06LVu2zL4AS2lp6TnXVVlZqfLyctlstrO2qf46se3PP/8sNzc3XX755Q7HXHPNNfb9p1OfYyXpnXfe0cKFC3XVVVc5jGoCwLlixAoAzsEVV1yh++67T6mpqfLw8NDatWt1//3312iXk5MjqWra1KmmTqWlpZ3xfaqPDw4OrrEvODhYCQkJkqpGxg4fPqw2bdqc24XUQn5+vmw222lryM/Pd9j2l7/8Rf7+/g5TCc8mPj5eU6dO1cKFCzV//vxzqu/QoUNnnDJ3Pms418+mofzrX//SVVddpd69e59yf1hYmHr16qWvvvpKnTp10n//+18NHz7c3l+q/fDDD3rmmWe0d+9e+fj4qFu3bvL29pakM4aj01mwYIEWLFggi8Wi4OBgDRw4UH/7298cFnSpHuU9Ub9+/SRVTW9s2bKlQ0iVZB85O9PnWZ9jJWnnzp0aOHCg/vOf/+iOO+5Qjx49ztgeAE6HYAUA5+Diiy+Wj4+PPvvsM3l7e6tNmzbq1atXjXb+/v6Squ5lqf7j8UQBAQFnfJ8WLVpIkjIyMmrsS09PV8uWLSVJiYmJKi4urrHgREPw8/OTyWQ6bQ3VNVabPHmyPvvsM02cOFHvvPNOraeM3Xjjjfrxxx9VXl6uJ554Qm3atNH48ePPeExlZaW2bNmikSNH1uo9Th5RrG8N5/rZNIRPP/1U27dvd5gaeiqXXXaZvvrqK11xxRXavn275s+f7xCsfvvtN02YMEFDhw7VK6+8orZt28pkMumdd96pMRVUOvtnJ1V9fjfeeKMqKyt1+PBhzZ49W3fffbc+/vhje5tp06Y5BOET75MKCAhQdna2KioqHAJS9T9AVPf3U6nPsZL0t7/9TbfffruuuuoqTZ06Ve+//36NkAYAtcFUQAA4B+7u7ho6dKg+//xzffrpp/Z7Ok7WqVMnBQUF6eDBg4qOjrZ/hYWFaebMmTVGEE7WsWNHhYSE6D//+Y/D9gMHDmjz5s3q06ePJOnbb79V9+7dFRgYeM7XUllZecY/IL29vdWrVy99+umnqqiosG/Pz8/Xt99+W+O+sl69emn+/Pk6dOiQnn/++VrXMWfOHH377bd67rnndMUVVyg6Ovqs9zf9+uuvKiwsVP/+/c/Yrnr05eTFG+pbw7l+NvVVWlqq6dOna8KECQ73P53K0KFDtWXLFr399tu64IILFBoa6rB/+/btKikp0T333KN27drZg1N1qKr+zKpX1DvbZydVLbYSHR2t2NhYXXHFFbr11lu1a9cu5ebm2tt07NjR4X8LJ97P1q9fP5WXl9dYdbM6mJ3p86zPsVLVCKOnp6eeeOIJ7dixQ0uWLDnr9QLAqTBiBQDn6Morr9S4ceNkNpsdVqQ7kcVi0QMPPKAnnnhCFotFl156qfLy8rRgwQIdPXr0rFPYzGazJk2apClTpujBBx/UNddco+zsbM2fP18BAQEaPX
q0duzYoXfeeUdXXXWVNm/ebD82PT1dUtXIRFZWVo3QlZWVpeTkZO3fv98e0E7nwQcf1JgxY3TPPffolltuUVlZmRY
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Список для хранения инерций\n",
|
|||
|
"inertias = []\n",
|
|||
|
"clusters_range = range(1, 11)\n",
|
|||
|
"\n",
|
|||
|
"# Вычисление инерции для каждого количества кластеров\n",
|
|||
|
"for i in clusters_range:\n",
|
|||
|
" kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
|
|||
|
" kmeans.fit(data_scaled)\n",
|
|||
|
" inertias.append(kmeans.inertia_)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация метода локтя\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.plot(clusters_range, inertias, marker='o')\n",
|
|||
|
"plt.title('Метод локтя для оптимального k')\n",
|
|||
|
"plt.xlabel('Количество кластеров')\n",
|
|||
|
"plt.ylabel('Инерция')\n",
|
|||
|
"plt.grid(True)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Расчет коэффициентов силуэта"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 59,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1oAAAImCAYAAABKNfuQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADAqElEQVR4nOzdd3iT5foH8G+SJt170xbaUronljJbprgX4kABPR7A40I9R1DUnxNFZXkAQVRQUTlyFERQ9KCI7FGghdI9KNC9d9OR5PdH2khpgaYkfZPm+7muXsL7vnlz57EtufM8z32LVCqVCkRERERERKQzYqEDICIiIiIiGmiYaBEREREREekYEy0iIiIiIiIdY6JFRERERESkY0y0iIiIiIiIdIyJFhERERERkY4x0SIiIiIiItIxJlpEREREREQ6xkSLiIiIiIhIx5hoEZHJmDVrFmbNmtXl2IkTJ3DnnXciNDQU3333nV6f/6WXXsKkSZO0ftykSZPw0ksv6SEiItKXoKAgrF69WugwiEhAZkIHQEQklMrKSvzjH/9AWFgYNmzYgKCgIKFDIiIiogGCiRYRmazPP/8ccrkcH3zwAdzd3YUOh4iIiAYQLh0kIpNUXV2NzZs344477uiWZOXn52P+/PkYO3YsoqOjMWvWLJw8ebLLNX/++SemTZuGqKgojBkzBq+//jrq6+u7XPPNN99g4sSJiIqKwvPPP4+GhgYAwLp16zB69GjExsbi9ddfR2trq+Yxra2tePPNNzFixAiMHDlSs/SosbERCxYsQHR0NMaPH49vvvlG85iCggIEBQVh27ZtmmMtLS2YPHlyl1m6npZOHjt2DEFBQTh27FiPfwfUM3+xsbHdlj1+9913uO222xAeHo4JEyZg9erVUCgUmvM9LZW8NNbO5+rpqzPOay2b7Ok1Xa6srAwvvvgiRo8ejZiYGMycORNJSUma85cv8VKpVHjwwQcRFBSEgoKCLtddLdb58+cjISEBSqWyy/O/8soruOmmmwAAJSUl+Oc//4lRo0YhKioKs2bNQnJyMgBg9erVV3yOzvgyMjLw9NNPY9SoUQgLC0N8fDwWL14MuVx+1TE4dOjQVWPv7WsEgD179uCee+5BVFTUVe91qW3btiEoKAinT5/GPffcg8jISNxxxx349ddfu1xXUFCAhQsXYty4cQgLC8Po0aOxcOFCVFdXa65JT0/Hww8/jJiYGEyZMgXffvut5lxP379A9++Tay3ru/T7btOmTd1+vo4ePYrg4GB89NFHV7zH5VatWoWQkBD88MMPvX4MERk3zmgRkUlRqVQoLi7G4sWL0d7ejscff7zL+ZycHNx///3w9fXFq6++CqlUik2bNuGRRx7Bxo0bERcXh8TERDzxxBO488478a9//QvZ2dn48MMPkZWVha+//hoSiQS///473nrrLcyaNQsJCQnYsmULfv/9dwDArl27sHjxYhQWFmLZsmWwsLDAokWLAABLly7F1q1bsXDhQnh4eGDlypUoLCxEYWEhbr75ZqxatQr79+/HW2+9BQ8PD0yePLnH1/nZZ591SRKux/Lly1FfXw87OzvNsfXr12PlypWYOXMmFi1ahPT0dKxevRrFxcV49913e3XfsLAwbNmyBYA6afv+++81f7exsdFJ7I2NjZgxYwYUCgUWLFgAd3d3bNy4EY899hh++OEH+Pr6dnvMjz/+2CURu9T06dNx3333af7+5ptvdjn3v//9D8eOHcPo0aMBAHK5HL/++ivmzp2L1tZWzJkzB21tbXj99dchlUqxdu1azJo1C//9739x3333IT4+vst9X3/9dQCAh4cHysrK8PDDDyM6OhrvvfceZDIZ9u/fj88//xxubm6YN2/eFcdBLpfDw8MD//73v3uMvbev8cKFC3j22WcRHx+P559/XvM9caV7Xe7xxx/HzJkz8fzzz+P777/Hc889h/Xr12P8+PFobm7G7Nmz4ejoiNdffx
22trZISkrCmjVrYGFhgbfeegvNzc2YO3cuvLy8sHr1apw6dQqvv/46Bg0ahISEhF7FoK1Zs2Zh9+7deP/99zFhwgTIZDK8/PLLiI6Oxj/+8Y9e3WPDhg1Yu3YtFi9ejHvuuUcvcRKR4WGiRUQmJTExERMmTIBUKsWnn37a7Y32mjVrIJPJsGnTJs2b/QkTJuD222/HBx98gO+//x7bt2+Hr68vlixZArFYjLFjx8LS0hKvvfYa9u3bh0mTJuHjjz/GyJEj8eqrrwIARo4cibFjx6K+vh5LlixBeHg4AKCurg6ffvopnnzySSiVSmzZsgXz5s3DzJkzAQAuLi544IEH4ODggGXLlkEqlSIhIQFZWVlYv359j4lWcXExPv30U4SFhSE1NfW6xislJQU//vgjQkJCUFdXBwCor6/H2rVr8cADD2he37hx4+Dg4IBXX30Vf/vb3zBs2LBr3tvGxgbR0dEAgAMHDgCA5u+68sMPP6CwsBA//PADQkJCAADDhw/H3XffjcTExG7//xsbG7Fs2bIrjp2Hh0eXGC9NCMeNGwcPDw9s375dk2j99ttvaGpqwt13343k5GTk5eXhm2++QUxMjCaWG2+8EWvXrsXq1avh4eHR5b6XPtfBgwcREhKCf//735rzY8aMwaFDh3Ds2LGrJlrNzc2ws7O7Yuy9fY1paWloa2vD888/j8DAwGve63KzZs3CU089BQCIj4/HPffcg48++gjjx49Hfn4+PDw88P7778PHxwcAMGrUKJw+fRrHjx8HABQWFiIiIgIvv/wyfHx8MG7cOGzevBkHDhzQW6IlEomwZMkS3HnnnVi6dCkkEglqamrw5ZdfQiKRXPPx//nPf7B06VK89dZbmD59ul5iJCLDxKWDRGRSQkND8d5778He3h6LFi3qNutz/PhxTJw4scsbRzMzM9x22204e/YsGhsb8c4772D79u0Qi8Vob29He3s7brrpJojFYiQmJqK9vR1paWkYN26c5h7m5uaIioqCpaWlJskC1G/O5XI5MjMzkZmZiZaWFs2sBqB+o21ubo7IyEhIpdIuj0tNTe2yVK/T+++/j9jYWEycOPG6xkqlUmHx4sWYPn06goODNceTkpIgl8sxadIkzetvb2/XLBM8dOhQl/tces3ly+p6G0dfH3vy5El4e3trkiwAsLS0xP/+978uszad1q5dC0dHR8yYMUPr5xKLxbjnnnuwe/duNDc3A1AnemPGjIGHhwfi4uKQnJyM6OhoKBQKtLe3w87ODmPHjkViYuI17z9u3Dh8/fXXMDc3R05ODvbs2YN169ahqqqqy/LTnhQXF8PW1lbr13S5sLAwmJmZ4euvv0ZhYSFaW1vR3t4OlUrVq8dfOpsjEolw44034syZM5DL5QgJCcHmzZvh5eWF/Px87Nu3Dxs2bEBeXp7m9QUEBGDdunXw8fFBa2sr9u/fj9raWgwdOrTL8yiVyi7fdz3F13lNb2L38fHBCy+8gB9++AHfffcdXn31VU0yeDV79+7Fm2++idjYWNx///3XvJ6IBhbOaBGRSbGxscE999wDf39/zJgxA8899xy2bNmi+WS6trYWLi4u3R7n4uIClUqFhoYGWFtbw9zcHID6jeel6urqUFlZCYVCAUdHxy7nHBwcYG9v3+VY59KriooKTdJ0+ePs7e3h4ODQ7XHt7e1d9q4A6kTx999/x44dO/Dzzz/3ZkiuaPv27cjPz8fHH3+M999/X3O8pqYGAK44g1JWVqb5c2FhYbcx6ksc27dvh0gkgrOzM2644QY8++yz3d5c96SmpgbOzs69ep78/Hx8+eWX+Oyzz1BUVNSnWO+99158/PHH2L17N0aNGoUjR45g2bJlmvMymQyAet/WpXt1ejMzolQqsWLFCnzzzTdoamqCp6cnIiMjNd+LV1NYWAgvL68+vKKufHx8sHTpUqxYsUKzzLNTXFzcNR/v5ubW5e/Ozs5QqVSoq6uDhYUFPv/8c3z88ceoqa
mBi4sLwsPDYWlp2W3/Y11dHUaMGAEAcHV1xS233NLl/KOPPtrtuS+Pb+3atVi7di0kEglcXFwwbtw4PPvss1csjHP
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Список для хранения коэффициентов силуэта\n",
|
|||
|
"silhouette_scores = []\n",
|
|||
|
"\n",
|
|||
|
"# Вычисление коэффициентов силуэта для каждого количества кластеров\n",
|
|||
|
"for i in clusters_range[1:]: \n",
|
|||
|
" kmeans = KMeans(n_clusters=i, random_state=random_state)\n",
|
|||
|
" labels = kmeans.fit_predict(data_scaled)\n",
|
|||
|
" score = silhouette_score(data_scaled, labels)\n",
|
|||
|
" silhouette_scores.append(score)\n",
|
|||
|
"\n",
|
|||
|
"# Построение диаграммы значений силуэта\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.plot(clusters_range[1:], silhouette_scores, marker='o')\n",
|
|||
|
"plt.title('Коэффициенты силуэта для разных k')\n",
|
|||
|
"plt.xlabel('Количество кластеров')\n",
|
|||
|
"plt.ylabel('Коэффициент силуэта')\n",
|
|||
|
"plt.grid(True)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 60,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Средний коэффициент силуэта: 0.544\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAJzCAYAAAA4M0NGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3iT5foH8G/2bJvuvfduaQtl740gOBAE5ChHPE5UXD/3UTkOcAGK4ygqIggIsvfelNECLQW6995Js39/9DQSk3TQpknb+3NdXtL3SfLeyZvx3u/zPPfD0Gq1WhBCCCGEEEIIMYlp6QAIIYQQQgghxNpR4kQIIYQQQggh7aDEiRBCCCGEEELaQYkTIYQQQgghhLSDEidCCCGEEEIIaQclToQQQgghhBDSDkqcCCGEEEIIIaQdlDgRQgghhBBCSDsocSKEEEIIIYSQdlDiRAi5a/Pnz0doaKjef4mJiViwYAHOnz9v6fAIIX1caGgoVq5cabD95s2bGDx4MEaOHInc3FyT91+5ciVCQ0MRHR2NxsZGo7f57bffEBoaijFjxnRX2ISQXooSJ0JIl0RERGDjxo3YuHEj1q9fjw8//BAcDgePPfYYbt26ZenwCCH9zK1bt7Bw4UIIBAKsW7cOfn5+7d5HpVLh8OHDRtt2797dzRESQnorSpwIIV0iFosRFxeHuLg4JCQkYNy4cVi5ciWYTCb++OMPS4dHCOlHsrKy8Mgjj0AkEmHdunXw9vbu0P0GDBiAPXv2GGwvKytDSkoKwsPDuztUQkgvRIkTIaTbCQQC8Hg8MBgM3bb58+dj/vz5erdbsWIFQkND9RKsdevWYezYsYiPj8e8efNw8+ZNAMCvv/6K0NBQ5OTk6D3Gn3/+ifDwcJSUlAAADh48iLlz5yI+Ph5RUVGYNGkSfv31V737vPrqqwZDDFv/Kyws1N3m70NzNmzYYDA0aPfu3ZgyZQri4uIwa9YspKSk6N2nvXjOnTuH0NBQnDt3Tu9+f3+9OvL6KRQKfPTRRxg5ciTCw8P1nldbSezfH/uDDz5AdHQ0jh8/DuCv4UzG/rsz7o689uXl5XjllVcwePBg3TG+fPkyAGDMmDHtHpeUlBTMmzcPsbGxGDhwIF555RVUV1frHv+PP/5AaGgoUlNTMXPmTMTExOCee+7B3r179eJoaGjAf/7zH4wbNw7R0dGYNm0aNm/erHebO+MJCwtDUlISnnnmGdTU1Jh8LQEgOzsbTz/9NAYOHIikpCQsXrwYWVlZJm/f1ut753HLzc3Fs88+i6FDhyIuLg7z58/HxYsXde2FhYW6+23fvl1vH0eOHNG13Wn37t2YNWsW4uPjMXToULz11luoq6sziO1Oxt6LY8aMwauvvmry779rjfXO53fp0iXMnj0b0dHRGDp0KN577z00NzebfIy/y8rKwoIFC2BjY4N169bBw8Ojw/edMmUKTp48aTBcb+/evfD390dYWJjBfQ4ePIhZs2bp4n3//fchlUoNbtORz/+ZM2fw6KOPIjY2FkOHDsUnn3wCtVqtu92pU6fw4IMPIj4+HklJSfjXv/7V5nuKEGIelDgRQrpEq9VCpVJBpVJBqVSioqICK1asgEKhwH333Wfyfvn5+Vi7dq3etv379+O9997D1KlTsXr1aqjVajzxxBNQKBS45557wOPx8Oeff+rdZ9u2bRg8eDDc3d1x9OhRPPXUU4iMjMRXX32FlStXwtvbG//+97+Rmpqqdz9nZ2fdEMONGzfiX//6V5vPs66uDp9//rnetrS0NCxduhRxcXH4+uuv4e7ujieeeAKVlZUA0Kl4OsvY6/fdd9/hp59+wiOPPIKffvoJGzduxKpVqzr1uGlpafjtt9/w+eefIz4+Xq/tztfrrbfe0mvryHNtamrCnDlzcO7cObz00ktYtWoVeDweHn30UeTm5mLVqlV6Mf/rX//S7c/FxQUXLlzAwoULwefz8fnnn+P//u//cP
78eSxYsMDgBHvx4sUYO3YsVq1aBX9/fyxZsgTHjh0DADQ3N2Pu3LnYsWMHFi1ahK+++goJCQl4/fXXsWbNGr3HGTlyJDZu3IhffvkFL774Ik6dOoUPPvjA5OtXVlaG2bNnIzc3F++88w4++eQTVFZW4pFHHkFtbW2br/2dr+/fj9vt27cxa9YsFBYW4o033sDy5cvBYDDwyCOPGMwnFIlEBsPOdu/eDSZT/yf/q6++wgsvvIC4uDh8+eWXeOqpp7Bv3z7Mnz+/UwlLdygpKcFjjz0Ge3t7rFq1Cs8++yz+/PNPvPzyyx26f3Z2Nh555BGIxWKsW7cOrq6undr/xIkToVarjb5uU6dONbj9jh078NRTTyEgIACrV6/G008/je3bt+PJJ5+EVqsF0LnP/9KlS5GQkIA1a9Zg2rRp+P7777Fp0yYAQEFBAZ588klERUXh66+/xgcffICcnBw8/vjj0Gg0nXqehJCuYVs6AEJI73bhwgVERkYabH/hhRcQGBho8n7Lli1DcHAwrl+/rttWXV2NuXPn4oUXXgDQ0oPSerU+PDwc48ePx/bt2/Hcc8+BwWCgtLQUZ8+exSeffAKg5eRy5syZeP3113WPGR8fj0GDBuHcuXOIjY3VbedyuYiLi9P9nZ2d3ebz/PLLL+Hh4aHX21BaWoqJEyfi/fffB5PJhJOTE6ZNm4YrV65g3LhxnYqns4y9fmlpaQgLC8Ojjz6q29baU9NRrT1+Y8eONWi78/WSy+V6bR15rlu3bkVRURG2bt2qG/o0YMAA3Hvvvbhw4QIeeOABvZh9fHz09rlixQr4+/vjm2++AYvFAgDExsZi6tSp2LJlCx5++GHdbefPn4+nnnoKADB8+HDMnDkTq1evxsiRI/HHH3/g5s2b2LBhgy45HD58OFQqFb766is89NBDkEgkAAAHBwddDElJSTh9+rTea/53a9euhUKhwI8//ghnZ2cAQFhYGObMmYPU1FSMHDnS5H3vfK5/P26rVq0Cl8vFzz//DLFYDAAYNWoUpk2bho8//livt2zEiBE4ceIEFAoFuFwu5HI5Dh06hKSkJF0PYV1dHb7++ms8+OCDeklwSEgIHn74YYPX09y+++472NvbY/Xq1bpjy2Qy8cYbbyAzM9Og1+tOubm5WLBgASorK6FUKu8qmXByckJSUhL27NmD6dOnAwCKioqQmpqKjz/+GF9//bXutlqtFsuXL8fw4cOxfPly3XY/Pz8sXLgQx44dw6hRozr1+X/ggQd079fBgwfj4MGDOHr0KB566CGkpaWhubkZixcv1iWEbm5uOHToEKRSqe79QAgxP0qcCCFdEhkZiXfffRdAywlFfX09jh8/js8++wxSqRTPP/+8wX2OHz+O06dP47vvvsOCBQt02x966CEAgEajgVQqxf79+8Hn8+Hp6QkAuP/++7Fz506kpKQgKSkJ27Ztg0gkwvjx4wEAixYtAtDSs5GTk4P8/HxcvXoVQEsSdrdu3ryp63VojREAJkyYgAkTJkCr1UIqlWLPnj1gMpnw9/c3azymXr/o6Gh8++232LdvH5KTkyESiTp8EqnVanH58mXs3r3boCerIzryXC9evAgvLy+9+SICgQD79u1r9/FlMhlSU1Px2GOP6Xo5AcDb2xuBgYE4deqU3on+zJkzdf9mMBgYP348Vq5ciebmZpw/fx6enp4GPWrTp0/H5s2b9RKc1n1pNBrcuHEDFy9exJAhQ0zGefHiRcTFxemSJqDlJPfIkSPtPse2nD9/HqNHj9Y7SWaz2bre2aamJt325ORkHD9+HOfOncPw4cNx/PhxiMViJCYm6hKnK1euQKFQYNq0aXr7SUxMhKenJ86fP9/lxKn1tWMymQa9Xa00Gg1UKhVSUlIwbNgwXdIEtCSAQMtr2lbitHPnTkRFReGzzz7Do48+ipdeeglr167V26dardb1BAEt74k79wW0DNd7//330djYCLFYjF27diEyMhK+vr56t8vOzk
ZpaSkWL16sex8CLYm1WCzGqVOnMGrUqE59/v/+XnRzc9MN+4uNjQWPx8P999+PSZMmYcSIERg0aBBiYmJMviaEEPO
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x700 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Добавляем индекс строки как дополнительный признак\n",
|
|||
|
"data_scaled_with_index = np.hstack((data_scaled, np.arange(data_scaled.shape[0]).reshape(-1, 1)))\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Применение K-Means\n",
|
|||
|
"# ========================\n",
|
|||
|
"kmeans = KMeans(n_clusters=3, random_state=42) \n",
|
|||
|
"df_clusters = kmeans.fit_predict(data_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Оценка качества кластеризации\n",
|
|||
|
"# ========================\n",
|
|||
|
"silhouette_avg = silhouette_score(data_scaled, df_clusters)\n",
|
|||
|
"print(f'Средний коэффициент силуэта: {silhouette_avg:.3f}')\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Визуализация кластеров\n",
|
|||
|
"# ========================\n",
|
|||
|
"pca = PCA(n_components=2)\n",
|
|||
|
"df_pca = pca.fit_transform(data_scaled_with_index)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 7))\n",
|
|||
|
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=df_clusters, palette='viridis', alpha=0.7)\n",
|
|||
|
"plt.title('Визуализация кластеров с помощью K-Means')\n",
|
|||
|
"plt.xlabel('Первая компонента PCA')\n",
|
|||
|
"plt.ylabel('Вторая компонента PCA')\n",
|
|||
|
"plt.legend(title='Кластер', loc='upper right')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Средний коэффициент силуэта, равный 0.544, указывает на умеренно хорошее разделение кластеров. \n",
|
|||
|
"\n",
|
|||
|
"Средний коэффициент силуэта (silhouette score) указывает на качество кластеризации, измеряя, насколько хорошо точки внутри одного кластера близки друг к другу по сравнению с точками из других кластеров. Значения коэффициента силуэта находятся в диапазоне от -1 до 1:\n",
|
|||
|
"\n",
|
|||
|
"1: Указывает на идеально плотные и четко разделенные кластеры. \n",
|
|||
|
"0: Указывает на перекрытие кластеров или слабую структуру кластеризации. \n",
|
|||
|
"Отрицательные значения: Указывают, что точки в кластере расположены ближе к другому кластеру, чем к своему."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 61,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"ename": "NameError",
|
|||
|
"evalue": "name 'AgglomerativeClustering' is not defined",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
|||
|
"\u001b[1;32me:\\System\\Memory\\Interface\\texts\\university\\MII\\1\\1\\lab5.ipynb Cell 31\u001b[0m line \u001b[0;36m7\n\u001b[0;32m <a href='vscode-notebook-cell:/e%3A/System/Memory/Interface/texts/university/MII/1/1/lab5.ipynb#X42sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m data_scaled_with_index \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mhstack((data_scaled, np\u001b[39m.\u001b[39marange(data_scaled\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m])\u001b[39m.\u001b[39mreshape(\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m, \u001b[39m1\u001b[39m)))\n\u001b[0;32m <a href='vscode-notebook-cell:/e%3A/System/Memory/Interface/texts/university/MII/1/1/lab5.ipynb#X42sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m \u001b[39m# ========================\u001b[39;00m\n\u001b[0;32m <a href='vscode-notebook-cell:/e%3A/System/Memory/Interface/texts/university/MII/1/1/lab5.ipynb#X42sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m \u001b[39m# Агломеративная кластеризация\u001b[39;00m\n\u001b[0;32m <a href='vscode-notebook-cell:/e%3A/System/Memory/Interface/texts/university/MII/1/1/lab5.ipynb#X42sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m \u001b[39m# ========================\u001b[39;00m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/e%3A/System/Memory/Interface/texts/university/MII/1/1/lab5.ipynb#X42sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m agg_cluster \u001b[39m=\u001b[39m AgglomerativeClustering(n_clusters\u001b[39m=\u001b[39m\u001b[39m3\u001b[39m) \n\u001b[0;32m <a href='vscode-notebook-cell:/e%3A/System/Memory/Interface/texts/university/MII/1/1/lab5.ipynb#X42sZmlsZQ%3D%3D?line=7'>8</a>\u001b[0m labels_agg \u001b[39m=\u001b[39m agg_cluster\u001b[39m.\u001b[39mfit_predict(data_scaled)\n\u001b[0;32m <a href='vscode-notebook-cell:/e%3A/System/Memory/Interface/texts/university/MII/1/1/lab5.ipynb#X42sZmlsZQ%3D%3D?line=9'>10</a>\u001b[0m \u001b[39m# ========================\u001b[39;00m\n\u001b[0;32m <a 
href='vscode-notebook-cell:/e%3A/System/Memory/Interface/texts/university/MII/1/1/lab5.ipynb#X42sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39m# Оценка качества кластеризации\u001b[39;00m\n\u001b[0;32m <a href='vscode-notebook-cell:/e%3A/System/Memory/Interface/texts/university/MII/1/1/lab5.ipynb#X42sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m \u001b[39m# ========================\u001b[39;00m\n",
|
|||
|
"\u001b[1;31mNameError\u001b[0m: name 'AgglomerativeClustering' is not defined"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Добавляем индекс строки как дополнительный признак\n",
|
|||
|
"data_scaled_with_index = np.hstack((data_scaled, np.arange(data_scaled.shape[0]).reshape(-1, 1)))\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Агломеративная кластеризация\n",
|
|||
|
"# ========================\n",
|
|||
|
"agg_cluster = AgglomerativeClustering(n_clusters=3) \n",
|
|||
|
"labels_agg = agg_cluster.fit_predict(data_scaled)\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Оценка качества кластеризации\n",
|
|||
|
"# ========================\n",
|
|||
|
"silhouette_avg_agg = silhouette_score(data_scaled, labels_agg)\n",
|
|||
|
"print(f'Средний коэффициент силуэта (агломеративная кластеризация): {silhouette_avg_agg:.3f}')\n",
|
|||
|
"\n",
|
|||
|
"# ========================\n",
|
|||
|
"# Визуализация кластеров\n",
|
|||
|
"# ========================\n",
|
|||
|
"pca = PCA(n_components=2)\n",
|
|||
|
"df_pca = pca.fit_transform(data_scaled_with_index)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 7))\n",
|
|||
|
"sns.scatterplot(x=df_pca[:, 0], y=df_pca[:, 1], hue=labels_agg, palette='viridis', alpha=0.7)\n",
|
|||
|
"plt.title('Визуализация кластеров с помощью агломеративной кластеризации')\n",
|
|||
|
"plt.xlabel('Первая компонента PCA')\n",
|
|||
|
"plt.ylabel('Вторая компонента PCA')\n",
|
|||
|
"plt.legend(title='Кластер', loc='upper right')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Значение коэффициента силуэта лежит в диапазоне от -1 до 1. Ближе к 1: Хорошо сформированные, плотные кластеры, четко отделенные друг от друга. \n",
|
|||
|
"\n",
|
|||
|
"Ближе к 0: Кластеры пересекаются или слабо разделены, не имеют четких границ. Точки расположены одинаково близко как к своему кластеру, так и к соседним. \n",
|
|||
|
"Ближе к -1 (Отрицательные значения): Некоторые точки скорее относятся к другим кластерам, чем к текущему (ближе к центрам других кластеров). Очень плохая кластеризация. \n",
|
|||
|
"Ближе к 1: Все точки внутри каждого кластера плотно сгруппированы и значительно удалены от точек других кластеров. Свидетельствует о четкой и хорошо разделенной структуре данных. Единица говорит об идеальной кластеризации.\n",
|
|||
|
"\n",
|
|||
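|
"Средний коэффициент силуэта, равный 0.724, указывает на то, что кластеры имеют хорошее разделение и четкие границы. Точки внутри каждого кластера достаточно плотно сгруппированы и значительно удалены от точек других кластеров, что свидетельствует о четкой и хорошо разделенной структуре данных. Примечание: в сохранённом выводе ячейка агломеративной кластеризации завершилась ошибкой NameError, поэтому значение 0.724 не подтверждается выводом ноутбука и должно быть перепроверено после повторного запуска."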
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.0"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|