689 lines
335 KiB
Plaintext
Raw Permalink Normal View History

2024-12-20 22:35:37 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Лабораторная 5\n",
"\n",
"Датасет: Информация об онлайн-обучении учеников\n",
"\n",
"## Бизнес-цель\n",
"Улучшение доступа к онлайн-образованию для учеников с низким уровнем финансового обеспечения."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Education Level', 'Institution Type', 'Gender', 'Age', 'Device',\n",
" 'IT Student', 'Location', 'Financial Condition', 'Internet Type',\n",
" 'Network Type', 'Flexibility Level'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Forward slashes resolve on Windows, Linux and macOS alike; the previous\n",
"# backslash-separated relative path only resolved on Windows.\n",
"df = pd.read_csv(\"../static/csv/students_adaptability_level_online_education.csv\")\n",
"print(df.columns)\n",
"\n",
"# Encode the ordinal flexibility labels as integers (Low < Moderate < High).\n",
"map_flexibility_to_int = {'Low': 0, 'Moderate': 1, 'High': 2}\n",
"\n",
"# .map() yields NaN for any label missing from the mapping and astype('int32')\n",
"# then raises, so an unexpected category fails here rather than further down.\n",
"df['Flexibility Level'] = df['Flexibility Level'].map(map_flexibility_to_int).astype('int32')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Конвейер из 4 лабораторной"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Flexibility Level</th>\n",
" <th>Education Level_School</th>\n",
" <th>Education Level_University</th>\n",
" <th>Institution Type_Public</th>\n",
" <th>Gender_Male</th>\n",
" <th>Device_Mobile</th>\n",
" <th>Device_Tab</th>\n",
" <th>IT Student_Yes</th>\n",
" <th>Location_Town</th>\n",
" <th>Financial Condition_Poor</th>\n",
" <th>Financial Condition_Rich</th>\n",
" <th>Internet Type_Wifi</th>\n",
" <th>Network Type_3G</th>\n",
" <th>Network Type_4G</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.018272</td>\n",
" <td>0.510309</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.018272</td>\n",
" <td>0.510309</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.160338</td>\n",
" <td>0.510309</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-1.040771</td>\n",
" <td>0.510309</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.160338</td>\n",
" <td>-1.107907</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1200</th>\n",
" <td>0.160338</td>\n",
" <td>-1.107907</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1201</th>\n",
" <td>0.160338</td>\n",
" <td>0.510309</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1202</th>\n",
" <td>-1.040771</td>\n",
" <td>0.510309</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1203</th>\n",
" <td>0.160338</td>\n",
" <td>-1.107907</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1204</th>\n",
" <td>-1.040771</td>\n",
" <td>0.510309</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1205 rows × 15 columns</p>\n",
"</div>"
],
"text/plain": [
" Age Flexibility Level Education Level_School \\\n",
"0 1.018272 0.510309 0.0 \n",
"1 1.018272 0.510309 0.0 \n",
"2 0.160338 0.510309 0.0 \n",
"3 -1.040771 0.510309 1.0 \n",
"4 0.160338 -1.107907 1.0 \n",
"... ... ... ... \n",
"1200 0.160338 -1.107907 0.0 \n",
"1201 0.160338 0.510309 0.0 \n",
"1202 -1.040771 0.510309 1.0 \n",
"1203 0.160338 -1.107907 0.0 \n",
"1204 -1.040771 0.510309 1.0 \n",
"\n",
" Education Level_University Institution Type_Public Gender_Male \\\n",
"0 1.0 0.0 1.0 \n",
"1 1.0 0.0 0.0 \n",
"2 0.0 1.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"1200 0.0 0.0 0.0 \n",
"1201 0.0 0.0 0.0 \n",
"1202 0.0 0.0 1.0 \n",
"1203 0.0 0.0 0.0 \n",
"1204 0.0 0.0 0.0 \n",
"\n",
" Device_Mobile Device_Tab IT Student_Yes Location_Town \\\n",
"0 0.0 1.0 0.0 1.0 \n",
"1 1.0 0.0 0.0 1.0 \n",
"2 1.0 0.0 0.0 1.0 \n",
"3 1.0 0.0 0.0 1.0 \n",
"4 1.0 0.0 0.0 1.0 \n",
"... ... ... ... ... \n",
"1200 1.0 0.0 0.0 1.0 \n",
"1201 1.0 0.0 0.0 0.0 \n",
"1202 1.0 0.0 0.0 1.0 \n",
"1203 1.0 0.0 0.0 0.0 \n",
"1204 1.0 0.0 0.0 1.0 \n",
"\n",
" Financial Condition_Poor Financial Condition_Rich Internet Type_Wifi \\\n",
"0 0.0 0.0 1.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 1.0 \n",
"3 0.0 0.0 0.0 \n",
"4 1.0 0.0 0.0 \n",
"... ... ... ... \n",
"1200 0.0 0.0 1.0 \n",
"1201 0.0 0.0 1.0 \n",
"1202 0.0 0.0 0.0 \n",
"1203 0.0 0.0 1.0 \n",
"1204 1.0 0.0 0.0 \n",
"\n",
" Network Type_3G Network Type_4G \n",
"0 0.0 1.0 \n",
"1 0.0 1.0 \n",
"2 0.0 1.0 \n",
"3 0.0 1.0 \n",
"4 1.0 0.0 \n",
"... ... ... \n",
"1200 0.0 1.0 \n",
"1201 0.0 1.0 \n",
"1202 1.0 0.0 \n",
"1203 0.0 1.0 \n",
"1204 1.0 0.0 \n",
"\n",
"[1205 rows x 15 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"# StandardScaler is documented under sklearn.preprocessing; importing it from\n",
"# sklearn.discriminant_analysis relied on an undocumented re-export.\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"\n",
"# Numeric columns are imputed and scaled; every other column is one-hot encoded.\n",
"# select_dtypes does not rely on text columns being stored with the object dtype.\n",
"# NOTE(review): the integer-encoded 'Flexibility Level' column is numeric here and\n",
"# is therefore scaled like any other feature - confirm that this is intended.\n",
"num_columns = df.select_dtypes(include=\"number\").columns.tolist()\n",
"cat_columns = df.select_dtypes(exclude=\"number\").columns.tolist()\n",
"\n",
"# Numeric branch: fill gaps with the median, then standardise.\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
"    [\n",
"        (\"imputer\", num_imputer),\n",
"        (\"scaler\", num_scaler),\n",
"    ]\n",
")\n",
"\n",
"# Categorical branch: fill gaps with a placeholder category, then one-hot encode\n",
"# with the first level of each feature dropped.\n",
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
"preprocessing_cat = Pipeline(\n",
"    [\n",
"        (\"imputer\", cat_imputer),\n",
"        (\"encoder\", cat_encoder),\n",
"    ]\n",
")\n",
"\n",
"# verbose_feature_names_out=False keeps the output column names free of the\n",
"# transformer-name prefix.\n",
"features_preprocessing = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"prepocessing_num\", preprocessing_num, num_columns),\n",
"        (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
"    ],\n",
"    remainder=\"passthrough\"\n",
")\n",
"\n",
"pipeline_end = Pipeline(\n",
"    [\n",
"        (\"features_preprocessing\", features_preprocessing),\n",
"    ]\n",
")\n",
"\n",
"# fit_transform returns a plain array; wrap it back into a labelled DataFrame.\n",
"preprocessing_result = pipeline_end.fit_transform(df)\n",
"preprocessed_df = pd.DataFrame(\n",
"    preprocessing_result,\n",
"    columns=pipeline_end.get_feature_names_out(),\n",
")\n",
"\n",
"preprocessed_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Понижение размерности (PCA) и визуализация данных."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1kAAAIjCAYAAADxz9EgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAC2CUlEQVR4nOzdeXhTVfoH8G/apmnTNGnTFbrvYGllrYILoGwuMIDjuMz8AHdnZFxAHJlxARxEFBT3ZcYRR8VxZHCZcYHK4gaKIFhEKN1LC3Rv03RJ0/b+/igJTbO3uU3Sfj/Pw6O99yY5OU1z73vPe94jEQRBABEREREREbmEj7sbQERERERENJQwyCIiIiIiInIhBllEREREREQuxCCLiIiIiIjIhRhkERERERERuRCDLCIiIiIiIhdikEVERERERORCDLKIiIiIiIhciEEWERERERGRCzHIIiIiIiIiciEGWUQ0rGzevBkSicTkX2RkJKZPn47PPvvM3c0jIpFNmzbN5O9frVZj0qRJ+Mc//oHu7m6z4/fs2YOFCxciOjoa/v7+iIyMxNy5c7Ft2zaLz3/s2DFIJBIEBASgsbFR5HdDRJ6KQRYRDUtr1qzBW2+9hX/+85944IEHUFNTgyuvvBL/+9//3N00IhJZbGws3nrrLbz11lt4+OGH0dnZiVtuuQV//vOfTY579NFHMX36dPz888+444478Morr2DFihXQarW45pprsGXLFrPnfvvttxEdHQ0A2Lp166C8HyLyPBJBEAR3N4KIaLBs3rwZN910E3744QdMnDjRuL2hoQFRUVG49tpr8c4777ixhUQkpmnTpqG2thY///yzcVtraysyMjLQ0NCAhoYGSKVSbN26Fddeey1+/etfY8uWLZBKpSbPs337duj1elx99dXGbYIgIDk5GQsXLkRJSQkaGhqwe/fuQXtvROQ5OJJFRAQgJCQEgYGB8PPzM24rLS2FRCLB5s2bTY696667IJFIsGTJEuO2bdu2IScnB2q1GoGBgRg1ahTWr18Pw32s3bt3QyKR4IMPPjB77S1btkAikWDfvn0AgLy8PCxZsgTJyckICAhAdHQ0br75ZtTV1Vlse2JiolkKpEQiwZ49e0yO6d1eAHj//fchkUiQmJho3Jafn4/LLrsM0dHRkMlkiIuLw5133on6+nrjMR0dHXjkkUcwYcIEqFQqBAUF4ZJLLjG7mDT034YNG8zaPGbMGEybNs1k27Rp08y2/fDDD8b305tWq8Xy5cuRnJwMqVRq8r5ra2st9lNflvqsb7+J8V737NkDiURicZRDoVCY/J4M6a0HDhyw+j769tvixYsREBCAY8eOmRw3e/ZshIaG4tSpU1afy/A+rP3r+/uprq7GLbfcgqioKAQEBOD888/Hm2++afa83d3dePbZZ5GVlYWAgABERERgzpw5Zu/LUjqvpddtbGzEvffei7i4OMhkMqSmpmL9+vUW0/0cIZfLceGFF6KlpQU1NTUAgIcffhhqtRr/+Mc/zAIsoKc/ewdYAPDtt9+itLQU119/Pa6//np89dVXqKio6FebiMi7+dk/hIho6GlqakJtbS0EQUB1dTWef/55aLVa/O53v7P5uMLCQvztb38z267RaHDBBRdg8eLFkEql+Pzzz/Hggw/Cz88Py5cvx7Rp0xAXF4d33nkHCxYsMHnsO++8g5SUFEyePBkAkJubi+LiYtx0002Ijo7G0aNH8dprr+Ho0aP47rvvzAIOALjkkktw++23A+iZE/L444/bfB+dnZ34y1/+Yra9paUFsbGxmDt3LpRKJX7++We8+OKLqKysxH//+1/je/373/+OG264Abfddhuam5vx+uuvY/bs2di/fz/Gjh1r87Wd8ac//cni9hUrVuCVV17BLbfcgosuughSqRTbtm2zGMTaMnPmTCxatAhAT0D33HPPmewfzPfqKs8++yx27dqFxYsXY9++ffD19cWrr76KHTt24K
233sLIkSPtPscNN9yAK6+80mTbypUrTX5ua2vDtGnTUFhYiKVLlyIpKQnvv/8+lixZgsbGRtxzzz3GY2+55RZs3rwZV1xxBW699VZ0dnbi66+/xnfffWcyomzwzDPPIDw8HACwdu1ak32tra2YOnUqKisrcccddyA+Ph579+7FypUrcfr0aWzatMnRrjJRXFwMX19fhISEoKCgAMePH8fNN9+M4OBgh5/D8Lc8adIkjBkzBnK5HO+++y5WrFjRrzYRkRcTiIiGkTfeeEMAYPZPJpMJmzdvNjm2pKREACC88cYbxm2/+c1vhDFjxghxcXHC4sWLbb7WeeedJ1x99dXGn1euXCnIZDKhsbHRuK26ulrw8/MTHn30UeO21tZWs+d69913BQDCV199ZbYvJiZGuOmmm4w/7969WwAg7N6927gtISHBpL0vvfSSIJPJhOnTpwsJCQk238cf/vAHQaFQGH/u7OwUdDqdyTENDQ1CVFSUcPPNNxu3GfrvqaeeMnvOzMxMYerUqSbbpk6darLt008/FQAIc+bMEfqerkaMGCHMnj3bZNujjz4qABBqampsvh9BEISOjg4BgLB06VLjtvfff9+s38R4r4bfz/vvv292bFBQkMnvyfB5/eGHH6y+l779JgiCsH37dgGA8Ne//lUoLi4WFAqFMH/+fKvP0Z/3sWnTJgGA8Pbbbxu3dXR0CJMnTxYUCoWg0WgEQRCEXbt2CQCEu+++2+w5u7u7TX7+29/+JgAQysrKrL6/xx57TAgKChJOnDhh8tgHH3xQ8PX1FcrLy22+x6lTpwqjRo0SampqhJqaGuHYsWPC3XffLQAQ5s6dKwiCIHz00UcCAOGZZ56x+Vy9dXR0CGFhYcJf/vIX47Ybb7xROP/88x1+DiIaOpguSETD0osvvojc3Fzk5ubi7bffxvTp03HrrbdarRgGAAcPHsT777+PdevWwcfH8tdnbW0tKioqsHnzZhQWFuLSSy817lu0aBF0Op1Jmth7772Hzs5OkxG0wMBA4/+3t7ejtrYWF154IQDgxx9/NHvNjo4OyGQyh997a2sr1qxZg6VLlyI+Pt7iMU1NTaiqqsLOnTvxySefmLwPX19f+Pv7A+hJA6uvr0dnZycmTpxosX39IQgCVq5ciWuuuQYXXHCB2f7m5maEhYX1+/nb29sBAAEBATaPc/a9tra2ora21uRfV1eXxedubm42O9Yaw8hrc3OzQ+9v1qxZuOOOO7BmzRosXLgQAQEBePXVVx16rKM+/fRTREdH44YbbjBuk0qluPvuu6HVavHll18CAP7zn/9AIpHg0UcfNXuOvqOyHR0dAGDz8/z+++/jkksuQWhoqEnfzZgxA11dXfjqq6/stv348eOIiIhAREQERo8ejeeffx5XXXUV/vGPfwDoGcEE4NQo1meffYa6ujqT/rjhhhvw008/4ejRow4/DxENDQyyiGhYysnJwYwZMzBjxgz89re/xSeffILzzjsPS5cuNV7o9fXggw/ikksuMZuHYdDe3o6IiAjExcXh5ptvxooVK0zShEaNGoVJkyaZFNZ45513cOGFFyI1NdW4rb6+Hvfccw+ioqIQGBiIiIgIJCUlAei52O6rqakJCoXC4ff+9NNPo7293aySWm+zZ89GdHQ0ZsyYgdGjR+O9994z2f/mm28iOzsbAQEBCAsLQ0REBD755BOL7euPd955B0ePHrWa9jh58mR88MEH2Lp1K06fPo3a2lq0trY6/PyGgEalUtk91pn3+uijjxov3g3/jh8/bvF5b775ZrNjW1paLB47Y8YMREREQKlUIjQ0FH/4wx+sHmuwYcMGqNVqHD58GM899xwiIyPtvldnlJWVIS0tzeyGw+jRo437AaCoqAgjR46EWq22+5yGkue2Ps8FBQX4/PPPzfpuxowZAHrmidmTmJiI3NxcfPHFF/jmm29w5swZ/O9//zOmKCqVSgBwOKgFeqoKJiUlQSaTobCwEIWFhUhJSY
FcLmcxHaJhiHOyiIgA+Pj4YPr06Xj22WdRUFCAzMxMk/07duzAF198YSxOYYm/vz9yc3PR2tqKr7/+GuvXr0dcXBz
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.decomposition import PCA\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Seed shared with the clustering cells further down.\n",
"rnd_state = 9\n",
"\n",
"# Project the preprocessed features onto two principal components.\n",
"pca = PCA(n_components=2, random_state=rnd_state)\n",
"data_pca = pca.fit_transform(preprocessed_df)\n",
"\n",
"# Labelled copy of the projection, used only for plotting.\n",
"df_pca = pd.DataFrame(data_pca, columns=['Principal Component 1', 'Principal Component 2'])\n",
"\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.scatterplot(\n",
"    data=df_pca,\n",
"    x='Principal Component 1',\n",
"    y='Principal Component 2',\n",
"    alpha=0.6,\n",
"    ax=ax,\n",
")\n",
"ax.set_title('Визуализация данных после PCA')\n",
"ax.set_xlabel('Главная компонента 1')\n",
"ax.set_ylabel('Главная компонента 2')\n",
"ax.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выбор количества кластеров с помощью двух методов: инерция и коэффициент силуэта."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKUAAAHqCAYAAADVi/1VAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADKU0lEQVR4nOzdd1yV5f/H8ddhuwARWYqKW9ziyK1lao40s9IsrUzLHJUtbbgafrOhDdMsSysr08zUypF75Tb3QtwCIjKVee7fH8b5RYAiAjfj/Xw8zuMb932d+7zv8wW5+Zzr+twWwzAMRERERERERERE8pGd2QFERERERERERKT4UVFKRERERERERETynYpSIiIiIiIiIiKS71SUEhERERERERGRfKeilIiIiIiIiIiI5DsVpUREREREREREJN+pKCUiIiIiIiIiIvlORSkREREREREREcl3KkqJiIiIiIiIiEi+U1FKRERERERERETynYpSInJDc+bMwWKxYLFY2LRpU4b9hmHg7++PxWKhR48eJiQUERGR4iztWmXnzp0Z9n3xxRdYLBZ69+5NampqvuTp0aMHVapUueXnjRgxAovFkvuBREQKMBWlRCRbXFxc+P777zNsX79+PefOncPZ2dmEVCIiIiKZ++WXXxg2bBht27blxx9/xN7e3uxIIiLyHypKiUi2dOvWjQULFpCSkpJu+/fff09QUBA+Pj4mJRMRERFJb926dfTv35/AwECWLl2Ki4uL2ZFERCQTKkqJSLb079+fy5cvs2rVKtu2pKQkFi5cyMMPP5zpc6xWK9OmTaNu3bq4uLjg7e3NU089xZUrV2xjqlSpYlsemNnj39Pf4+PjeeGFF/D398fZ2ZlatWrx/vvvYxhGhtdet25dlse8mbTnrlu3Lt327t27Y7FYmDBhgm3bhAkTsFgsREREpBu7c+dOLBYLc+bMSbf9yJEj9O3bFw8PD1xcXGjatClLlixJNyZtGcKGDRt46qmnKFeuHK6urgwcODDde5f2/mW2bDKzJQBpWf8tLi4OHx+fDOfboUMHOnTokG7sjh07sv0eioiImGXv3r306tULX19fVqxYgZubW4YxCxYsICgoiBIlSuDp6ckjjzzC+fPnbfvPnz9P//79qVChAs7OzlStWpWXX36Z2NjYDMf69ttv8ff3x93dncmTJ9u2z58/Hz8/Pzw9PXn33XczPG/FihXUrFmT0qVLM2rUKNv1zLp166hWrRqurq6MHj063bLDvLpGOXXqVKbXLcOHD8disfDYY4+l2x4VFcVzzz1nuyarXr067777LlarNcMx33///QznXq9ePdt1xo2u2dIe/z6vzCQkJDBhwgRq1qyJi4sLvr6+9OnTh+Dg4BydH1y/FsosS9oxxo8fj6OjI5cuXcrw3KFDh+Lu7k5CQgIbN26kU6dOeHp6UqJECRo3bsyMGTPSXb9m9VqZXb+mpKTw5ptvUq1aNZydnalSpQqvvvoqiYmJ6TL8+xrbzs4OHx8fHnroIc6cOXPD91IkvzmYHUBECocqVarQsmVLfvjhB+655x4A/vjjD6Kjo+nXrx8ff/xxhuc89dRTzJkzh8cff5xRo0YREhLCp59+yp49e9i8eTOOjo5MmzaNuLg4AA4fPsw777zDq6++Sp06dQAoXbo0cL131b333svatWsZPHgwjRo1YsWKFbz00kucP3+eqVOnZpp71KhRNGvWDIBvvvkmXVHtVmzYsIHff/89R89Nc/DgQVq3bk2FChUYM2YMpUqV4qeffqJ37978/PPP3HfffenGjxgxAnd3dyZMmMDRo0eZMWMGp0+ftl285YYPPviAsLCwbI195ZVXcuU1RURE8kpwcDBdu3bF2dmZFStW4Ovrm2FM2rVJs2bNmDx5MmFhYXz00Uds3ryZPXv24O7uTnBwMGFhYYwcOZKyZcty8OBBPv74Y1avXs2mTZsoUaIEAJs3b2bQoEG0atWK/v378+
2333Ly5EmuXbvGpEmTePXVV1m5ciVjxoyhUqVK9O/fH4CTJ0/Su3dvqlevzjvvvMPy5cttPbGGDx/OyJEj2bNnD1OnTqV8+fKMHTs2y3POjWuUzJw4cYIvvvgiw/arV6/Svn17zp8/z1NPPUWlSpXYsmULY8eO5eLFi0ybNu2WXqdOnTp8++23tq9nzZrF4cOH013bNWjQIMvnp6am0qNHD1avXk2/fv149tlniY2NZdWqVRw4cIBq1ard0vn9W+3atXnttdcAiIiI4Pnnn7fte/TRR5k0aRLz589nxIgRtu1pH9ref//9uLi4sGXLFry8vHj99dext7dn/fr1PPPMM+zbt48ZM2YA8Nprr/Hkk0+me52hQ4fStm3bDJmefPJJ5s6dS9++fXnhhRfYtm0bkydP5vDhw/zyyy/pxrZt25ahQ4ditVo5cOAA06ZN48KFC2zcuPGG5y2SrwwRkRv4+uuvDcDYsWOH8emnnxplypQxrl69ahiGYTzwwANGx44dDcMwjMqVKxvdu3e3PW/jxo0GYMybNy/d8ZYvX57pdsMwjLVr1xqAsXbt2gz7Fi9ebADGW2+9lW573759DYvFYpw4cSLd9pUrVxqAsXDhQtu24cOHG9n5Zy+zHC1atDDuueceAzDGjx9v2z5+/HgDMC5dupTuGDt27DAA4+uvv7Ztu+uuu4z69esbCQkJtm1Wq9Vo1aqVUaNGDdu2tPc8KCjISEpKsm2fMmWKARi//vqrbdt/3/cbnWta1jTh4eFGmTJlbOf17/Nt37690b59e9vXv//+uwEYXbt2zdZ7KCIikl/Sfm8uW7bMqFatmgEYnTt3znRsUlKS4eXlZdSrV8+4du2abfuyZcsMwBg3blyWr7Nq1SoDMCZNmmTbdu+99xoBAQG23+2xsbFGQECAUbJkSePkyZOGYVz/Xd+6dWujYcOGtueNGjXKKFOmjBEREWEYhmEkJycbd9xxhwEY27Zts43r37+/4eXlZTt+Xl2jhISEZNj24IMPGvXq1TP8/f2NQYMG2ba/+eabRqlSpYxjx46lO+6YMWMMe3t748yZM+mO+d5772V4L+vWrZvuOuPfBg0aZFSuXDnTfZn56quvDMD48MMPM+yzWq23fH5pWrdubbvOzeoYLVu2NFq0aJHueYsWLcryejbNa6+9ZgDGhg0bMuzL7HXS7N271wCMJ598Mt32F1980QCMNWvW2LZVrlw5w3k9/PDDRsmSJbPMJWIGLd8TkWx78MEHuXbtGsuWLSM2NpZly5ZluXRvwYIFuLm5cffddxMREWF7BAUFUbp0adauXXtLr/37779jb2/PqFGj0m1/4YUXMAyDP/74I932hIQEgFzpIbFo0SJ27NjB//73vyzHREZGpjvP6OjoDPvXrFnDgw8+SGxsrG3c5cuX6dKlC8ePH0+3bACuT/12dHS0fT1s2DAcHBxy7dPQN998Ezc3twzv6X8ZhsHYsWO5//77adGiRa68toiISG577LHHOHv2LA8//DArV65kwYIFGcbs3LmT8PBwnnnmmXTXCN27d6d27dr89ttvtm3Jycnpfrc3atSIpk2bpjvu6tWr6datm+2GL6VLlyYwMJDy5csTEBAAYLv7399//83ly5dtz2vXrh3lypUDwMHBgaCgIACaN29uO36fPn0IDw/nwIEDmZ5zdq5RcmLXrl0sWLCAyZMnY2eX/k/GBQsW0LZtW8qWLZvu/enUqROpqals2LAh3firV6+mGxcREZGrd0L8+eef8fT0ZOTIkRn2ZTWz/EbnlyYpKemmN/IZOHAg27Ztsy0TBJg3bx7+/v60b9/etu2/78GQIUNwdHTM9Hv0RtKuAUePHp1u+wsvvACQ7vsXIDExkYiICMLDw1m1ahVr1qzhrrvuuqXXFMlrKkqJSLaVL1+eTp068f3337No0SJSU1Pp27dvpmOPHz9OdHQ0Xl5elC9fPt0jLi6O8PDwW3rt06dP4+fnR5kyZdJtT1vmd/
r06XTb0/onZNZH4lakpqby6quvMmDAgBtOHa9Vq1a6c+zUqVO6/SdOnMAwDN54440M78f48eMBMrwnNWrUSPd16dK
"text/plain": [
"<Figure size 1200x500 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"import os\n",
"# NOTE(review): presumably meant to control the OpenMP thread count used by KMeans;\n",
"# scikit-learn is already imported by earlier cells at this point, so the setting\n",
"# may come too late to take effect - confirm.\n",
"os.environ['OMP_NUM_THREADS'] = '12'\n",
"\n",
"# Candidate cluster counts and the two quality curves collected for them.\n",
"k_range = range(2, 11)\n",
"inertia, silhouette_scores = [], []\n",
"\n",
"for k in k_range:\n",
"    kmeans = KMeans(n_clusters=k, random_state=rnd_state).fit(data_pca)\n",
"    inertia.append(kmeans.inertia_)\n",
"    silhouette_scores.append(silhouette_score(data_pca, kmeans.labels_, random_state=rnd_state))\n",
"\n",
"# Left panel: inertia per k. Right panel: mean silhouette per k.\n",
"fig, (ax_inertia, ax_silhouette) = plt.subplots(1, 2, figsize=(12, 5))\n",
"\n",
"ax_inertia.plot(k_range, inertia, marker='o')\n",
"ax_inertia.set_title('Метод инерции')\n",
"ax_inertia.set_xlabel('Количество кластеров')\n",
"ax_inertia.set_ylabel('Инерция')\n",
"\n",
"ax_silhouette.plot(k_range, silhouette_scores, marker='o')\n",
"ax_silhouette.set_title('Коэффициент силуэтов')\n",
"ax_silhouette.set_xlabel('Количество кластеров')\n",
"ax_silhouette.set_ylabel('Оценка силуэтов')\n",
"\n",
"fig.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Судя по графикам, при n=8 коэффициент силуэта максимален, а дальнейшее уменьшение инерции минимально. Выбираем n=8."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Cluster count picked from the inertia and silhouette curves plotted above.\n",
"clusters_count = 8"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Кластерный анализ. Иерархическая и неиерархическая кластеризация."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAr4AAAIjCAYAAADlfxjoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADnZUlEQVR4nOzdd3wU1RbA8d/MbE1PSOi9CFIERDoKKMWCiF1RESxYsOtTsYEVFbGBioqKig0sYEEUVGygSFNsFOklIQmkb595f2wSsmSTbCDJJtnzfZ/9PHfmzp0zSUjO3rlzrmIYhoEQQgghhBD1nBruAIQQQgghhKgJkvgKIYQQQoiIIImvEEIIIYSICJL4CiGEEEKIiCCJrxBCCCGEiAiS+AohhBBCiIggia8QQgghhIgIkvgKIYQQQoiIIImvEEIIIYSICJL4CiGEEEKIiCCJrxB10Ny5c1EUhdWrVwfdP2TIELp27VrDUQkhhBC1myS+QgghhBAiIkjiK4QQQgghIoIkvkJEkHnz5tGrVy/sdjtJSUlcdNFF7Nq1K6DNkCFDGDJkSMC23377DUVRUBTliPvs2rUra9asYcCAAdjtdtq0acPs2bMD2rndbh544AF69epFfHw80dHRnHjiiXz33XcB7bZv346iKDz11FM888wztGrVCrvdzuDBg/nzzz+L2+3fv5+UlBSGDBmCYRjF27ds2UJ0dDQXXnhhpa97+fLlKIrC8uXLA9qOHz+e1q1bB2zTdZ1nn32WLl26YLPZaNSoEddccw0HDx4s9XX88ssvGTx4MLGxscTFxdG7d2/efffdcuN79NFHUVU1oN2PP/7I+eefT8uWLbFarbRo0YJbb70Vh8NR6pwffvghJ5xwArGxscXXWfR1LU/RVJvt27cXb/vrr79ITExk1KhReL3egPZDhgwJ6L/oNXfu3COK+99//+WCCy4gJSUFu91Ox44duffeewGYOnVq0HOVfJX83v3666+ceuqpxMfHExUVxeDBg/n5558DzlfUZ9F54+LiaNCgATfffDNOpzOgraIoTJ06NWDb9OnTURQl4Pv3008/MWjQIJKTk7HZbLRt25a77roroL8DBw5wxx130K1bN2JiYoiLi+O0007j999/D+i/6Gfyww8/LPW1iomJYfz48QHbWrduXWrbggULUBSl1M/w/v37ufLKK2nZsiWaphV/DWNiYkqdS4i6wBTuAIQQRy47O5uMjIxS2z0eT6ltjz76KPfffz8XXHABV111Fenp6cycOZOTTjqJdevWkZCQUOZ57rrrrqDbK9PnwYMHOf3007ngggu4+OKLmT9/Ptdddx0Wi4UrrrgCgJycHObMmcPFF1/M1VdfTW5uLq+99hojR45k1apV9OjRI+D8b731Frm5uUyaNAmn08lzzz3HySefzIYNG2jUqBENGzbkpZde4vzzz2fmzJncdNNN6LrO+PHjiY2N5cUXXyz361vWdYfqmmuuYe7cuUyYMIGbbrqJbdu2MWvWLNatW8fPP/+M2WwG/InkFVdcQZcuXZg8eTIJCQmsW7eOJUuWMHbs2KB9v/HGG9x3333MmDEjoM2CBQsoKCjguuuuo0GDBqxatYqZM2eye/duFixYUNxu5cqVXHDBBXTv3p3HH3+c+Ph4MjIyuPXWWyt9nbt27eLUU0+lU6dOzJ8/H5Op9J+WTp06FSenwc4Tatx//PEHJ554ImazmYkTJ9K6dWv+++8/PvvsMx599FHOOecc2rdvX9z+1ltv5dhjj2XixInF24499lgAvv32W0477TR69erFlClTUFWVN954g5NPPpkff/yRPn36BMR4wQUX0Lp1a6ZNm8Yvv/zC888/z8GDB3nrrbfK/NpkZWUxbdq0Uttzc3M59thjueCCC4iKimLlypU8+eSTFBQUMHPmTAC2bt3KwoULOf/882nTpg1paWm8/PLLDB48mL///pumTZuWed7K8Hq9xd+bw11++eUsW7
aMG2+8ke7du6NpGq+88gpr166tknMLUeMMIUSd88YbbxhAua8uXboUt9++fbuhaZrx6KOPBvSzYcMGw2QyBWwfPHiwMXjw4OL3ixcvNgDj1FNPNUr+yqhsn4AxY8aM4m0ul8vo0aOH0bBhQ8PtdhuGYRher9dwuVwB/R08eNBo1KiRccUVVxRv27ZtmwEYdrvd2L17d/H2X3/91QCMW2+9NaCPiy++2IiKijI2bdpkTJ8+3QCMhQsXBrQJ9bq///57AzC+/fbbgOMvv/xyo1WrVsXvf/zxRwMw3nnnnYB2S5YsCdielZVlxMbGGn379jUcDkdAW13Xg8b3xRdfGCaTybj99tuNwxUUFJTaNm3aNENRFGPHjh3F2yZPnmwAxr59+4q3FX1dp0+fXqqPkop+/rZt22YcOHDA6Ny5s9GxY0cjIyMjaPuBAwcaQ4cOLXWeN954o9Jxn3TSSUZsbGzANsMI/FqV1KpVK+Pyyy8vtV3XdaNDhw7GyJEjA44tKCgw2rRpYwwfPrx425QpUwzAGD16dEAf119/vQEYv//+e/E2wJgyZUrx+zvvvNNo2LCh0atXr4Cfr2BOP/10o2vXrsXvnU6n4fP5Atps27bNsFqtxkMPPVS87bvvvjMAY8GCBaX6jI6OLnX9h39NXnzxRcNqtRpDhw4N+Bl2OByGqqrGNddcE3D85ZdfbkRHR5d7LULUVjLVQYg67IUXXmDp0qWlXscdd1xAu48//hhd17ngggvIyMgofjVu3JgOHTqUmkpQxDAMJk+ezLnnnkvfvn2Pqk+TycQ111xT/N5isXDNNdewf/9+1qxZA4CmaVgsFsA/TeDAgQN4vV5OOOGEoCNMY8aMoVmzZsXv+/TpQ9++fVm8eHFAu1mzZhEfH895553H/fffz2WXXcZZZ51V5te1vOtu2LAhALt37y7zePCPYMbHxzN8+PCAr0+vXr2IiYkp/vosXbqU3Nxc7r77bmw2W0AfwaaWrFq1igsuuIBzzz2X6dOnl9pvt9uL/zs/P5+MjAwGDBiAYRisW7eueF9ubi6qqpY70l8Rp9PJ6NGjSU9PZ8mSJTRo0CBoO7fbjdVqLbevUOJOT0/nhx9+4IorrqBly5YBxwf7WpVn/fr1bN68mbFjx5KZmVn8/cnPz+eUU07hhx9+QNf1gGMmTZoU8P7GG28EKPXzVmTPnj3MnDmT+++/v8ypAQcOHGDfvn0sXLiQlStXctJJJxXvs1qtqKr/z7TP5yMzM5OYmBg6duxYZSOuBQUFPPTQQ9xwww2lvqb5+fnoul7m91WIukimOghRh/Xp04cTTjih1PbExMSAKRCbN2/GMAw6dOgQtJ+iW+6He+edd/jrr7+YP39+wDzSI+mzadOmREdHB2w75phjAP+c3X79+gHw5ptvMmPGDP7999+AKRtt2rQpdY5g5z7mmGOYP39+wLakpCSef/55zj//fBo1asTzzz8fNOYi5V1327Ztady4MU899RTdu3cvvt3scrkC2m3evJns7OziRPlw+/fvB+C///4DCKn83J49ezjjjDPIz88nMzMzaLK3c+dOHnjgAT799NNSc4mzs7OL/7t///7MmjWLm2++mTvvvJP4+Pigc4/LM2HCBH755RdsNlupeb0lZWVl0apVq3L7CiXurVu3AqF9rSqyefNmwH8rvyzZ2dkkJiYWvz/8561du3aoqhow17mkKVOm0LRpU6655pqg828BOnfuTFpaGuCfJ/7cc88V79N1neeee44XX3yRbdu24fP5ivdVVTL69NNP43Q6ueeee7jtttsC9jVo0IAOHTowZ84cBg8eTI8ePVBVtdTPuhB1iSS+QkQAXddRFIUvv/wSTdNK7Q82GuV2u7n//vu58sorixPUo+2zIvPmzWP8+PGMGTOG//3vfzRs2BBN05g2bVpxgnikvvrqK8A/13j37t1ljnRWdN0Wi4VXX32VsWPH0r1794B9JZM7Xddp2LAh77zzTtDzpKSkVP
oatmzZwvHHH88zzzzDZZddxptvvhmQuPl8PoYPH86BAwe466676NSpE9HR0ezZs4fx48cHjGBedNFFrF27lpkzZ/L
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAApsAAAIYCAYAAAAiticSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABWL0lEQVR4nO3dd3gU1eL/8U8KKZCChC5FBKQoqDe00EvoCAiIFJWmIFIUrHiVpogKioCAeIGAioqxgF4VCyg2uhTLFUFDh9ATahKS8/uDX/abzW7CbNgxm/B+Pc8+kNmzZ87Mzsx+duacWT9jjBEAAABgA//8bgAAAAAKL8ImAAAAbEPYBAAAgG0ImwAAALANYRMAAAC2IWwCAADANoRNAAAA2IawCQAAANsQNgEAAGAbwiYAIN9kZGTopptu0pQpU/K7KfAhaWlpqlixoubOnZvfTYEXEDZh2eLFi+Xn56dNmza5PDdw4ED5+fnppptuyoeWASio3nnnHe3bt08jR47M76bAhxQpUkRjx47VlClTdOHChfxuDq4QYRNXbNeuXXrrrbfyuxkACqBp06apT58+ioyMzO+mwMcMGjRIx44d09tvv53fTcEVImziik2ZMkVFihRRjRo18rspAAqQLVu2aNu2berdu3d+NwU+qHjx4mrXrp0WL16c303BFSJs4or89ddfeuuttzRs2DCVLVvWbZm33npL0dHRCg0NVYkSJdSnTx/t27fPqUzLli110003afPmzWrcuLFCQ0NVpUoVvfbaa07lUlNTNX78eEVHRysyMlLFihVTs2bN9M033ziV2717t/z8/DR9+nTNmDFDlStXVmhoqFq0aKFff/3VUe7IkSMqVaqUWrZsKWOMY/quXbtUrFgx3XnnnU5tbNmypdN8Nm7cKD8/P/n5+Tmmffvtt/Lz89O3337rVHbgwIG67rrrnKZlZGTolVde0Y033qiQkBCVKVNGw4YN08mTJ13W4+eff64WLVooPDxcERERql+/vtM3fnftmzJlivz9/Z3Kff/997rjjjtUqVIlBQcHq2LFihozZozOnz/vMs/3339f9erVU3h4uGM5M9drbjK7XOzevdsx7bffftM111yjLl266OLFi07lW7Zs6VR/5iPrh4wn7f7jjz/Uu3dvlSpVSqGhoapRo4b+/e9/S5ImTpzodl5ZH1nfu/Xr16tDhw6KjIxU0aJF1aJFC/34449O88usM3O+ERERioqK0oMPPuhyCdDPz08TJ050mjZt2jT5+fk5vX8//PCDmjZtqpIlSyokJETXX3+9Hn/8caf6Tpw4oUceeUR16tRRWFiYIiIi1LFjR23bts2p/sxt8v3333dZV2FhYRo4cKDTtOuuu85lWnx8vPz8/Fy24SNHjmjIkCGqVKmSAgICHOswLCzMZV7ZLV++XEFBQWrevLnb53PaLrKvP8mz40x206dPd9lepf87jrh7ZC2bub1/9913GjZsmKKiohQREaF77rnH7b48d+5c3XjjjQoODlb58uU1YsQInTp1ytKyx8bGOsr4+flp5MiRWrp0qWrUqKGQkBBFR0fru+++c6prz549euCBB1SjRg2FhoYqKipKd9xxh8vyZi5HUFCQjh496vTc2rVrHW3I2pUqs53du3d3Wc5hw4a57V41ffp0NW7cWFFRUQoNDVV0dLTbbVOS2rZtqx9++EEnTpxw+zwKhsD8bgAKtmeffVaBgYF6/PHH1bdvX5fnp0yZoqefflq9e/fWvffeq6NHj2r27Nlq3ry5tmzZouLFizvKnjx5Up06dVLv3r3Vt29fvffeexo+fLiCgoI0ePBgSVJycrIWLFigvn376r777tPp06e1cOFCtW/fXhs2bNAtt9ziNP833nhDp0+f1ogRI3ThwgXNnDlTrVu31i+//KIyZcqodOnSmjdvnu644w7Nnj1bo0ePVkZGhgYOHKjw8PDLdk5//PHHr2j9DRs2TIsXL9agQYM0ev
RoJSQk6NVXX9WWLVv0448/qkiRIpIufQgMHjxYN954o8aNG6fixYtry5YtWrlypfr16+e27ri4OD311FN66aWXnMrEx8fr3LlzGj58uKKiorRhwwbNnj1b+/fvV3x8vKPc2rVr1bt3b9188816/vnnFRkZqWPHjmnMmDEeL+e+ffvUoUMH1axZU++9954CA10PPTVr1nQEQnfzsdru7du3q1mzZipSpIiGDh2q6667Tn/99Zc++eQTTZkyRT169FC1atUc5ceMGaNatWpp6NChjmm1atWSJK1evVodO3ZUdHS0JkyYIH9/f8XFxal169b6/vvv1aBBA6c29u7dW9ddd52mTp2qdevWadasWTp58qTeeOONHNfNqVOnNHXqVJfpp0+fVq1atdS7d28VLVpUa9eu1Ysvvqhz585p9uzZkqS///5by5cv1x133KEqVaooMTFR8+fPV4sWLfT777+rfPnyOc7XExcvXnS8N9kNGDBAX3/9tUaNGqWbb75ZAQEBev311/Xzzz9ftt6ffvpJN910k2M7d6dChQqO9XPmzBkNHz7cpYwnx5m86Nu3rzp16iRJ+uyzz/TOO++4LTdy5EgVL15cEydO1I4dOzRv3jzt2bPHEfalS19MJk2apNjYWA0fPtxRbuPGjU77fPZlz1SuXDmnv9esWaNly5Zp9OjRCg4O1ty5c9WhQwdt2LDBEfI2btyon376SX369FGFChW0e/duzZs3Ty1bttTvv/+uokWLOtUZEBCgt956y2kfjIuLU0hIiNv+kyEhIfr000915MgRlS5dWpJ0/vx5LVu2TCEhIS7lZ86cqa5du6p///5KTU3Vu+++qzvuuEP//e9/1blzZ6ey0dHRMsbop59+UpcuXdyudxQABrAoLi7OSDIbN240xhjz119/mcDAQDN69GhjjDEtWrQwN954o6P87t27TUBAgJkyZYpTPb/88osJDAx0mt6iRQsjybz00kuOaSkpKeaWW24xpUuXNqmpqcYYYy5evGhSUlKc6jt58qQpU6aMGTx4sGNaQkKCkWRCQ0PN/v37HdPXr19vJJkxY8Y41dG3b19TtGhR8+eff5pp06YZSWb58uVOZVq0aGFatGjh+Puzzz4zkkyHDh1M1l1pzZo1RpJZvXq10+sHDBhgKleu7Pj7+++/N5LM0qVLncqtXLnSafqpU6dMeHi4adiwoTl//rxT2YyMDLft+/TTT01gYKB5+OGHTXbnzp1zmTZ16lTj5+dn9uzZ45g2btw4I8kcOnTIMS1zvU6bNs2ljqwyt5WEhARz4sQJU7t2bVOjRg1z7Ngxt+WbNGliWrVq5TKfuLg4j9vdvHlzEx4e7jTNGOd1lVXlypXNgAEDXKZnZGSY6tWrm/bt2zu99ty5c6ZKlSqmbdu2jmkTJkwwkkzXrl2d6njggQeMJLNt2zbHNElmwoQJjr8fe+wxU7p0aRMdHe20fbnTqVMnc9NNNzn+vnDhgklPT3cqk5CQYIKDg83kyZMd07755hsjycTHx7vUWaxYMZflz75O5s6da4KDg02rVq2ctuHz588bf39/M2zYMKfXDxgwwBQrVizXZTHGmAoVKpiePXvm+Hzjxo2dlvfo0aMu68/T40zWY1SmzH0+ISHBafqff/5pJJnp06fnWjZze4+OjnYcq4wx5sUXXzSSzIoVK4wxxhw5csQEBQWZdu3aOb1vr776qpFkFi1adNm2ZiXJSDKbNm1yTNuzZ48JCQkxt99+u2Oau31n7dq1RpJ54403XJajb9++pk6dOo7pZ8+eNREREaZfv35OnwFZ21m3bl2n9fTmm2+aChUqmGbNmrksR/b2pKammptuusm0bt3apZ0HDx40kswLL7yQ67qAb+MyOvIs86zmE0884fb5Dz/8UBkZGerdu7eOHTvmeJQtW1bVq1d3ufQdGBioYcOGOf4OCgrSsGHDdOTIEW3evFnSpW/cQUFBki5dgj5x4oQuXryoevXquT2T0r17d1
177bWOvxs0aKCGDRvqs88+cyr36quvKjIyUr169dLTTz+tu+++W926dctx2Y0xGjdunHr27KmGDRs6PZf5zX7//v0
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAr4AAAIjCAYAAADlfxjoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADpSElEQVR4nOzdd3gU1dfA8e/MbE2FhCT03pEmIAJKUUABRfRVFAtgRQV7xS6KqOjPghVRUbFiASsKgiKCIk1AlF4DCSEhffvM+8cmIUvaBtL3fJ5nH92ZOzNnl01y9s695yqGYRgIIYQQQghRx6nVHYAQQgghhBBVQRJfIYQQQggREiTxFUIIIYQQIUESXyGEEEIIERIk8RVCCCGEECFBEl8hhBBCCBESJPEVQgghhBAhQRJfIYQQQggREiTxFUIIIYQQIUESXyGEEEIIERIk8RWiDps7dy6KorBmzZoi+yZOnIiiKJxyyinVEJkQQghR9STxFSIE7dixg3nz5lV3GEIIIUSVMlV3AEKIqjd9+nTMZjNt27at7lCEEEKIKiM9vkKEmJ07dzJv3jwmTZpEw4YNi20zb948evXqhd1uJyYmhssuu4z9+/cHtBk8eDCnnHIKa9eupX///tjtdlq1asUbb7wR0M7tdvPII4/Qq1cvoqOjCQ8P58wzz2TZsmUB7fbs2YOiKDz33HO88MILtGjRArvdzqBBg9i8eXNBu8OHDxMXF8fgwYMxDKNg+44dOwgPD+fSSy8NiHHw4MEB1/nrr79QFAVFUQq2/fLLLyiKwi+//BLQduLEibRs2TJgm67rvPjii3Tp0gWbzUZCQgKTJk3i6NGjRd7HH374gUGDBhEZGUlUVBR9+vTho48+KjW+6dOno6pqQLvffvuNSy65hObNm2O1WmnWrBl33HEHDoejyDU///xzevfuTWRkZMHrzH9fS5M/LGbPnj0F2/755x/q16/Peeedh9frDWg/ePDggPPnP+bOnXtCcf/333+MHTuWuLg47HY7HTp04MEHHwTgscceK/ZahR+F/+3+/PNPzj33XKKjowkLC2PQoEH8/vvvAdfLP2f+daOiooiNjeW2227D6XQGtFUUhcceeyxg28yZM1EUJeDfb8WKFZxxxhk0aNAAm81G69atue+++wLOl5aWxt13303Xrl2JiIggKiqKESNG8PfffwecP/8z+fnnnxd5ryIiIpg4cWLAtpYtWxbZNn/+fBRFKfIZPnz4MNdeey3NmzdH07SC9zAiIqLItYSoa6THV4gQ8+STT2IymbjvvvsYN25ckf3Tp0/n4YcfZuzYsVx33XWkpKQwa9YsBg4cyPr166lXr15B26NHjzJy5EjGjh3LuHHj+Oyzz7jpppuwWCxcc801AGRmZjJnzhzGjRvH9ddfT1ZWFm+//TbnnHMOq1evpkePHgHXf//998nKymLy5Mk4nU5eeuklzjrrLDZt2kRCQgLx8fG8/vrrXHLJJcyaNYtbb70VXdeZOHEikZGRvPbaa6W+/vvuu++k3r9JkyYxd+5crr76am699VZ2797NK6+8wvr16/n9998xm82AP5G85ppr6NKlC1OnTqVevXqsX7+eRYsWcfnllxd77nfffZeHHnqI559/PqDN/Pnzyc3N5aabbiI2NpbVq1cza9YsDhw4wPz58wvarVq1irFjx9K9e3eefvppoqOjOXLkCHfccUe5X+f+/fs599xz6dixI5999hkmU9E/Fx07dixITou7TrBxb9y4kTPPPBOz2cwNN9xAy5Yt2blzJ9988w3Tp0/noosuCrg7cccdd9CpUyduuOGGgm2dOnUCYOnSpYwYMYJevXrx6KOPoqoq7777LmeddRa//fYbp512WkCMY8eOpWXLlsyYMYM//viDl19+maNHj/L++++X+N6kp6czY8aMItuzsrLo1KkTY8eOJSwsjFWrVvHss8+Sm5vLrFmzANi1axcLFizgkksuoVWrViQnJ/
Pmm28yaNAgtmzZQuPGjUu8bnl4vd6Cf5vjTZgwgSVLlnDLLbfQvXt3NE1j9uzZrFu3rkKuLUSNZggh6qx3333XAIy//vrLMAzD2Llzp2EymYxbb73VMAzDGDRokNGlS5eC9nv27DE0TTOmT58ecJ5NmzYZJpMpYPugQYMMwHj++ecLtrlcLqNHjx5GfHy84Xa7DcMwDK/Xa7hcroDzHT161EhISDCuueaagm27d+82AMNutxsHDhwo2P7nn38agHHHHXcEnGPcuHFGWFiYsW3bNmPmzJkGYCxYsCCgzaBBg4xBgwYVPP/+++8NwDj33HONwr/+fv31VwMwli5dGnD8hAkTjBYtWhQ8/+233wzA+PDDDwPaLVq0KGB7enq6ERkZafTt29dwOBwBbXVdLza+7777zjCZTMZdd91lHC83N7fIthkzZhiKohh79+4t2DZ16lQDMA4dOlSwLf99nTlzZpFzFJb/Wdm9e7eRlpZmdO7c2ejQoYNx5MiRYtsPGDDAGDJkSJHrvPvuu+WOe+DAgUZkZGTANsMIfK8Ka9GihTFhwoQi23VdN9q1a2ecc845Acfm5uYarVq1MoYNG1aw7dFHHzUAY/To0QHnuPnmmw3A+Pvvvwu2Acajjz5a8Pzee+814uPjjV69egV8voozcuRI45RTTil47nQ6DZ/PF9Bm9+7dhtVqNaZNm1awbdmyZQZgzJ8/v8g5w8PDi7z+49+T1157zbBarcaQIUMCPsMOh8NQVdWYNGlSwPETJkwwwsPDS30tQtQFMtRBiBCS39t7//33F7v/yy+/RNd1xo4dy5EjRwoeDRs2pF27dkWGJ5hMJiZNmlTw3GKxMGnSJA4fPszatWsB0DQNi8UC+IcJpKWl4fV66d27d7E9TGPGjKFJkyYFz0877TT69u3L999/H9DulVdeITo6mosvvpiHH36Yq666igsuuKDE124YBlOnTuX//u//6Nu3b8C++Ph4AA4cOFDi8eDvwYyOjmbYsGEB70+vXr2IiIgoeH8WL15MVlYW999/PzabLeAchYdY5Fu9ejVjx47l//7v/5g5c2aR/Xa7veD/c3JyOHLkCP3798cwDNavX1+wLysrC1VVA3rly8vpdDJ69GhSUlJYtGgRsbGxxbZzu91YrdZSzxVM3CkpKSxfvpxrrrmG5s2bBxxf3HtVmg0bNrB9+3Yuv/xyUlNTC/59cnJyOPvss1m+fDm6rgccM3ny5IDnt9xyC0CRz1u+xMREZs2axcMPP1zi0IC0tDQOHTrEggULWLVqFQMHDizYZ7VaUVX/n16fz0dqaioRERF06NChwnpcc3NzmTZtGlOmTCnynubk5KDreon/rkLUdZL4ChEidu3axQcffMANN9xAo0aNim2zfft2DMOgXbt2xMXFBTz+/fdfDh8+HNC+cePGhIeHB2xr3749QMBY0ffee49u3bphs9mIjY0lLi6O7777joyMjCIxtGvXrsi29u3bB5wPICYmhpdffpmNGzcSHR3Nyy+/XOrr//DDD/nnn3946qmniuxr3bo1DRs25LnnnmPjxo0FCZPL5Sry/mRkZBAfH1/k/cnOzi54f3bu3AkQVKm4xMRERo0aRU5ODqmpqcUme/v27WPixInExMQQERFBXFwcgwYNAgh4D/v164eu69x2223s3LmTI0eOFDv2uDRXX301K1asICsrq8i43sLS09PLHBMaTNy7du0CgnuvyrJ9+3bAfyv/+H+fOXPm4HK5inzmjv+8tWnTBlVVi3ze8j366KM0btw44Avf8Tp37kzjxo258MILueCCC3jppZcK9um6zgsvvEC7du2wWq00aNCAuLg4Nm7cWOzPw4n43//+h9Pp5IEHHiiyLzY2lnbt2jFnzhx++uknDh8+XOxnXYi6Ssb4ChEipk+fXjC2tyS6rqMoCj/88AOaphXZfyKTX+bNm8fEiRMZM2YM99xzD/Hx8WiaxowZMwoSxBP1448/Av6xxgcOHCixp9
PtdvPwww9z7bXXFiTmhVksFt566y0uv/xyunfvHrCvRYsWBf+v6zrx8fF8+OGHxV4nLi6u3K9hx44dnHrqqbzwwgt
"text/plain": [
"<Figure size 800x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.cluster import AgglomerativeClustering\n",
"from scipy.cluster.hierarchy import dendrogram, linkage\n",
"\n",
"\n",
"def plot_cluster_scatter(labels, title):\n",
"    \"\"\"Draw the 2-D PCA projection as a scatter plot coloured by cluster label.\n",
"\n",
"    Reads the notebook-level data_pca array.\n",
"\n",
"    Args:\n",
"        labels: one cluster label per row of data_pca.\n",
"        title: figure title.\n",
"    \"\"\"\n",
"    plt.figure(figsize=(8, 6))\n",
"    plt.scatter(data_pca[:, 0], data_pca[:, 1], c=labels, cmap='viridis', alpha=0.6)\n",
"    plt.title(title)\n",
"    plt.xlabel('PCA 1')\n",
"    plt.ylabel('PCA 2')\n",
"    plt.show()\n",
"\n",
"\n",
"# Non-hierarchical clustering: k-means on the PCA projection.\n",
"kmeans = KMeans(n_clusters=clusters_count, random_state=rnd_state)\n",
"kmeans_labels = kmeans.fit_predict(data_pca)\n",
"plot_cluster_scatter(kmeans_labels, 'Неиерархическая кластеризация')\n",
"\n",
"# Hierarchical clustering. Ward linkage (the estimator's default) is spelled out\n",
"# so the labels and the dendrogram below visibly use the same merge criterion.\n",
"hierarchical = AgglomerativeClustering(n_clusters=clusters_count, linkage='ward')\n",
"hierarchical_labels = hierarchical.fit_predict(data_pca)\n",
"\n",
"plt.figure(figsize=(8, 6))\n",
"linkage_matrix = linkage(data_pca, method='ward')\n",
"# truncate_mode='level' with p=5 limits how many levels of the tree are drawn.\n",
"dendrogram(linkage_matrix, truncate_mode='level', p=5)\n",
"plt.title('Иерархическая кластеризация (дендрограмма)')\n",
"plt.show()\n",
"\n",
"plot_cluster_scatter(hierarchical_labels, 'Иерархическая кластеризация')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка качества кластеризации коэффициентом силуэта"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Silhouette Score for KMeans: 0.7138\n",
"Silhouette Score for Hierarchical Clustering: 0.7077\n"
]
}
],
"source": [
"# Mean silhouette coefficient of each labelling on the same 2-D PCA projection.\n",
"# random_state is only consulted when sample_size is given, so it is inert here.\n",
"silhouette_kmeans, silhouette_hierarchical = (\n",
"    silhouette_score(data_pca, labels, random_state=rnd_state)\n",
"    for labels in (kmeans_labels, hierarchical_labels)\n",
")\n",
"\n",
"print(f'Silhouette Score for KMeans: {silhouette_kmeans:.4f}')\n",
"print(f'Silhouette Score for Hierarchical Clustering: {silhouette_hierarchical:.4f}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"С небольшим отрывом побеждает неиерархическая кластеризация (KMeans): коэффициент силуэта 0.7138 против 0.7077 у иерархической."
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}