AIM-PIbd-32-Chubykina-P-P/lab5.ipynb at 3b1a445d5e5f4bbe2a0fdf8467d971615ec80f94

chubykina_polina/AIM-PIbd-32-Chubykina-P-P

Полина Чубыкина 3b1a445d5e лаба 5 почти почти готово

2024-12-12 18:50:47 +04:00

3.1 MiB

Raw Blame History

In [1]:

import pandas as pd 
df = pd.read_csv("..//static//csv//heart_2020_cleaned.csv")
print(df.columns)

display(df.head(10))

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

	HeartDisease	BMI	Smoking	AlcoholDrinking	Stroke	PhysicalHealth	MentalHealth	DiffWalking	Sex	AgeCategory	Race	Diabetic	PhysicalActivity	GenHealth	SleepTime	Asthma	KidneyDisease	SkinCancer
0	No	16.60	Yes	No	No	3.0	30.0	No	Female	55-59	White	Yes	Yes	Very good	5.0	Yes	No	Yes
1	No	20.34	No	No	Yes	0.0	0.0	No	Female	80 or older	White	No	Yes	Very good	7.0	No	No	No
2	No	26.58	Yes	No	No	20.0	30.0	No	Male	65-69	White	Yes	Yes	Fair	8.0	Yes	No	No
3	No	24.21	No	No	No	0.0	0.0	No	Female	75-79	White	No	No	Good	6.0	No	No	Yes
4	No	23.71	No	No	No	28.0	0.0	Yes	Female	40-44	White	No	Yes	Very good	8.0	No	No	No
5	Yes	28.87	Yes	No	No	6.0	0.0	Yes	Female	75-79	Black	No	No	Fair	12.0	No	No	No
6	No	21.63	No	No	No	15.0	0.0	No	Female	70-74	White	No	Yes	Fair	4.0	Yes	No	Yes
7	No	31.64	Yes	No	No	5.0	0.0	Yes	Female	80 or older	White	Yes	No	Good	9.0	Yes	No	No
8	No	26.45	No	No	No	0.0	0.0	No	Female	80 or older	White	No, borderline diabetes	No	Fair	5.0	No	Yes	No
9	No	40.69	No	No	No	0.0	0.0	Yes	Male	65-69	White	No	Yes	Good	10.0	No	No	No

Сегментация пациентов по рискам сердечно-сосудистых заболеваний Цель: Определить группы пациентов с различными уровнями риска развития сердечно-сосудистых заболеваний на основе их демографических данных, образа жизни и состояния здоровья.

Пример:

Кластер 1: Пациенты с высоким риском (курение, высокий ИМТ, низкая физическая активность).

Кластер 2: Пациенты со средним риском (умеренное курение, средний ИМТ, средняя физическая активность).

Кластер 3: Пациенты с низким риском (отсутствие вредных привычек, нормальный ИМТ, высокая физическая активность).

Бизнес-применение:

Разработка персонализированных программ профилактики и лечения для каждой группы пациентов.

Таргетированная реклама медицинских услуг и продуктов для улучшения образа жизни.

In [ ]:

import pandas as pd

df = pd.read_csv("..//static//csv//heart_2020_cleaned.csv").head(1000)
df = df.dropna()
from scipy import stats

z_scores = stats.zscore(df[['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']])
abs_z_scores = abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
df = df[filtered_entries]
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

X_resampled, y_resampled = ros.fit_resample(X, y)

df_balanced = X_resampled.copy()
df_balanced['HeartDisease'] = y_resampled

df = df_balanced

In [ ]:

scaler = StandardScaler()
df_encoded[numeric_features] = scaler.fit_transform(df_encoded[numeric_features])

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 2
      1 ## стандартизация
----> 2 scaler = StandardScaler()
      3 df_encoded[numeric_features] = scaler.fit_transform(df_encoded[numeric_features])

NameError: name 'StandardScaler' is not defined

In [ ]:

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(16, 12))

plt.subplot(2, 2, 1)
sns.scatterplot(x='BMI', y='PhysicalHealth', hue='HeartDisease', data=df)
plt.title('BMI vs PhysicalHealth')

plt.subplot(2, 2, 2)
sns.scatterplot(x='MentalHealth', y='SleepTime', hue='HeartDisease', data=df)
plt.title('MentalHealth vs SleepTime')

plt.subplot(2, 2, 3)
sns.scatterplot(x='PhysicalHealth', y='SleepTime', hue='HeartDisease', data=df)
plt.title('PhysicalHealth vs SleepTime')

plt.subplot(2, 2, 4)
sns.scatterplot(x='BMI', y='MentalHealth', hue='HeartDisease', data=df)
plt.title('BMI vs MentalHealth')

plt.tight_layout()
plt.show()

No description has been provided for this image

In [ ]:

plt.figure(figsize=(16, 12))

plt.subplot(2, 2, 1)
sns.scatterplot(x='BMI', y='PhysicalHealth', data=df)
plt.title('BMI vs PhysicalHealth')

plt.subplot(2, 2, 2)
sns.scatterplot(x='MentalHealth', y='SleepTime', data=df)
plt.title('MentalHealth vs SleepTime')

plt.subplot(2, 2, 3)
sns.scatterplot(x='PhysicalHealth', y='SleepTime', data=df)
plt.title('PhysicalHealth vs SleepTime')

plt.subplot(2, 2, 4)
sns.scatterplot(x='BMI', y='MentalHealth', data=df)
plt.title('BMI vs MentalHealth')

plt.tight_layout()
plt.show()

In [ ]:

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

X = df[['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']]

agg_clustering = AgglomerativeClustering(n_clusters=3)
clusters = agg_clustering.fit_predict(X)

df['Cluster'] = clusters

linked = linkage(X, 'ward')

plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Dendrogram')
plt.show()

In [ ]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt

def draw_data_2d(data, feature_x, feature_y, labels, subplot):
    subplot.scatter(data.iloc[:, feature_x], data.iloc[:, feature_y], c=labels, cmap='viridis')
    subplot.set_xlabel(data.columns[feature_x])
    subplot.set_ylabel(data.columns[feature_y])

X = df[['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']]

kmeans = MiniBatchKMeans(n_clusters=3, random_state=0, batch_size=100)
result = kmeans.fit_predict(X)

plt.figure(figsize=(16, 24))

plt.subplot(4, 2, 1)
draw_data_2d(X, 0, 1, result, plt.subplot(4, 2, 1))
plt.title('Clusters (BMI vs PhysicalHealth)')

plt.subplot(4, 2, 2)
draw_data_2d(X, 0, 1, df['HeartDisease'], plt.subplot(4, 2, 2))
plt.title('True Labels (BMI vs PhysicalHealth)')

plt.subplot(4, 2, 3)
draw_data_2d(X, 2, 3, result, plt.subplot(4, 2, 3))
plt.title('Clusters (MentalHealth vs SleepTime)')

plt.subplot(4, 2, 4)
draw_data_2d(X, 2, 3, df['HeartDisease'], plt.subplot(4, 2, 4))
plt.title('True Labels (MentalHealth vs SleepTime)')

plt.subplot(4, 2, 5)
draw_data_2d(X, 0, 2, result, plt.subplot(4, 2, 5))
plt.title('Clusters (BMI vs MentalHealth)')

plt.subplot(4, 2, 6)
draw_data_2d(X, 0, 2, df['HeartDisease'], plt.subplot(4, 2, 6))
plt.title('True Labels (BMI vs MentalHealth)')

plt.subplot(4, 2, 7)
draw_data_2d(X, 1, 3, result, plt.subplot(4, 2, 7))
plt.title('Clusters (PhysicalHealth vs SleepTime)')

plt.subplot(4, 2, 8)
draw_data_2d(X, 1, 3, df['HeartDisease'], plt.subplot(4, 2, 8))
plt.title('True Labels (PhysicalHealth vs SleepTime)')

plt.tight_layout()
plt.show()

In [ ]:

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def run_kmeans(data, n_clusters, random_state):
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(data)
    centers = kmeans.cluster_centers_
    return labels, centers

def print_cluster_result(data, n_clusters, labels):
    for i in range(n_clusters):
        cluster_indices = [index for index, label in enumerate(labels) if label == i]
        print(f"Cluster {i+1} ({len(cluster_indices)}):")
        print(", ".join(map(str, cluster_indices)))
        print("--------")

def draw_cluster_results(data, feature_x, feature_y, labels, centers, subplot):
    subplot.scatter(data.iloc[:, feature_x], data.iloc[:, feature_y], c=labels, cmap='viridis')
    subplot.scatter(centers[:, feature_x], centers[:, feature_y], marker='x', s=200, linewidths=3, color='r')
    subplot.set_xlabel(data.columns[feature_x])
    subplot.set_ylabel(data.columns[feature_y])

random_state = 9
n_clusters = 3
labels, centers = run_kmeans(X, n_clusters, random_state)
print_cluster_result(X, n_clusters, labels)
display(centers)
display(df['HeartDisease'])

plt.figure(figsize=(16, 12))
draw_cluster_results(X, 0, 1, labels, centers, plt.subplot(2, 2, 1))
draw_cluster_results(X, 2, 3, labels, centers, plt.subplot(2, 2, 2))
draw_cluster_results(X, 0, 2, labels, centers, plt.subplot(2, 2, 3))
draw_cluster_results(X, 1, 3, labels, centers, plt.subplot(2, 2, 4))
plt.tight_layout()
plt.show()

Cluster 1 (120):
4, 6, 10, 26, 32, 44, 49, 54, 57, 65, 66, 71, 76, 80, 81, 90, 97, 105, 119, 121, 130, 155, 157, 166, 206, 218, 219, 239, 240, 247, 263, 267, 278, 279, 281, 284, 288, 293, 297, 306, 315, 317, 323, 326, 333, 335, 343, 346, 355, 378, 398, 402, 406, 407, 410, 418, 431, 443, 447, 453, 455, 469, 499, 516, 539, 543, 544, 545, 551, 559, 570, 575, 581, 586, 597, 601, 602, 605, 609, 615, 616, 618, 619, 622, 629, 647, 650, 653, 659, 668, 680, 686, 689, 703, 718, 725, 728, 731, 732, 743, 749, 760, 789, 818, 833, 840, 857, 866, 888, 906, 915, 930, 939, 945, 948, 964, 969, 976, 990, 998
--------
Cluster 2 (796):
1, 3, 5, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22, 24, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 50, 51, 52, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 67, 68, 69, 70, 72, 73, 74, 75, 77, 79, 82, 83, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96, 98, 99, 100, 102, 103, 104, 107, 108, 109, 110, 111, 112, 113, 114, 116, 117, 118, 120, 122, 123, 124, 125, 127, 128, 129, 131, 132, 133, 134, 135, 137, 140, 141, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 158, 159, 160, 161, 163, 164, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, 190, 191, 192, 193, 194, 196, 197, 198, 199, 201, 202, 203, 204, 205, 207, 208, 209, 210, 211, 212, 213, 214, 215, 217, 220, 221, 222, 223, 224, 225, 226, 227, 228, 230, 232, 233, 234, 235, 236, 237, 238, 241, 243, 244, 245, 246, 248, 249, 251, 252, 253, 255, 256, 257, 258, 259, 260, 261, 262, 264, 265, 268, 269, 271, 274, 275, 276, 277, 280, 282, 283, 285, 286, 287, 289, 291, 292, 294, 295, 298, 299, 300, 301, 302, 303, 304, 305, 307, 308, 309, 310, 311, 312, 313, 314, 316, 318, 319, 320, 321, 322, 324, 325, 327, 328, 329, 330, 331, 332, 334, 337, 338, 339, 340, 341, 342, 344, 345, 347, 348, 349, 351, 352, 353, 354, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 379, 380, 381, 382, 383, 385, 386, 387, 389, 390, 391, 392, 393, 394, 395, 396, 397, 399, 400, 403, 405, 408, 409, 411, 412, 413, 414, 415, 416, 417, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 444, 445, 446, 448, 449, 450, 452, 454, 456, 457, 458, 460, 461, 462, 463, 464, 465, 466, 467, 468, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 491, 492, 493, 494, 495, 496, 497, 498, 500, 501, 502, 505, 506, 507, 508, 509, 510, 512, 513, 514, 515, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 540, 541, 542, 546, 547, 548, 549, 550, 552, 554, 555, 556, 557, 558, 560, 561, 562, 563, 565, 566, 567, 569, 571, 573, 574, 576, 577, 578, 579, 580, 582, 583, 584, 585, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 598, 599, 600, 604, 607, 608, 610, 611, 612, 613, 614, 617, 620, 621, 623, 624, 626, 627, 628, 630, 631, 632, 633, 634, 635, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 648, 649, 651, 652, 654, 655, 656, 658, 660, 661, 662, 663, 665, 666, 667, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 681, 682, 683, 684, 685, 687, 688, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 701, 702, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 719, 720, 721, 722, 723, 724, 726, 729, 730, 733, 734, 735, 736, 737, 739, 740, 741, 742, 744, 745, 746, 748, 751, 752, 753, 754, 755, 756, 757, 759, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 775, 776, 777, 778, 779, 780, 781, 782, 784, 785, 786, 787, 788, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 834, 835, 836, 837, 839, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 858, 859, 860, 861, 862, 863, 864, 865, 867, 869, 870, 871, 872, 873, 874, 875, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 889, 890, 892, 893, 894, 895, 898, 899, 900, 901, 902, 903, 904, 905, 907, 909, 910, 911, 912, 913, 914, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 931, 933, 934, 935, 936, 938, 940, 941, 942, 943, 944, 946, 947, 949, 950, 951, 952, 953, 954, 955, 956, 959, 961, 962, 963, 965, 967, 970, 971, 972, 973, 974, 975, 978, 979, 980, 981, 982, 983, 984, 985, 986, 988, 989, 991, 992, 993, 994, 996, 997, 999
--------
Cluster 3 (84):
0, 2, 14, 17, 23, 25, 78, 84, 101, 106, 115, 126, 136, 138, 139, 142, 156, 162, 165, 188, 195, 200, 216, 229, 231, 242, 250, 254, 266, 270, 272, 273, 290, 296, 336, 350, 384, 388, 401, 404, 430, 451, 459, 490, 503, 504, 511, 538, 553, 564, 568, 572, 603, 606, 625, 636, 657, 664, 700, 727, 738, 747, 750, 758, 774, 783, 801, 838, 868, 876, 891, 896, 897, 908, 932, 937, 957, 958, 960, 966, 968, 977, 987, 995
--------

array([[31.04766667, 24.575     ,  2.90833333,  7.21666667],
       [28.68121859,  1.00125628,  0.97361809,  7.33165829],
       [30.42404762, 10.45238095, 26.0952381 ,  6.42857143]])

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    1
998    0
999    0
Name: HeartDisease, Length: 1000, dtype: int64

In [ ]:

from sklearn.decomposition import PCA

reduced_data = PCA(n_components=2).fit_transform(X)

print(reduced_data)

[[  9.31321501  26.77646868]
 [ -6.91783938   1.05243249]
 [ 25.54665344  17.10779298]
 ...
 [ -5.04837552  -0.86215887]
 [ 22.80197652 -17.26780336]
 [ -6.07756748  -0.15069417]]

In [ ]:

import matplotlib.pyplot as plt
import pandas as pd

plt.figure(figsize=(16, 6))

draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    labels=[0] * len(reduced_data),  
    subplot=plt.subplot(1, 2, 1),
)
plt.title('PCA Data (Without True Labels)')

draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    df['HeartDisease'],
    plt.subplot(1, 2, 2),
)
plt.title('PCA Data (With True Labels)')

plt.tight_layout()
plt.show()

In [ ]:

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def fit_kmeans(data, n_clusters, random_state):
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(data)
    return kmeans

def draw_clusters(data, kmeans):
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_
    
    plt.figure(figsize=(10, 6))
    plt.scatter(data[:, 0], data[:, 1], c=labels, cmap='viridis', alpha=0.6)
    plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, linewidths=3, color='r')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('K-means Clustering (2 Clusters) with PCA')
    plt.show()

kmeans = fit_kmeans(reduced_data, 2, random_state)
draw_clusters(reduced_data, kmeans)

In [ ]:

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def fit_kmeans(data, n_clusters, random_state):
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(data)
    return kmeans

def draw_clusters(data, kmeans):
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_
    
    plt.figure(figsize=(10, 6))
    plt.scatter(data[:, 0], data[:, 1], c=labels, cmap='viridis', alpha=0.6)
    plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, linewidths=3, color='white')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('K-means Clustering (PCA-reduced data)')
    plt.show()

kmeans = fit_kmeans(reduced_data, 3, random_state)
draw_clusters(reduced_data, kmeans)

In [ ]:

import matplotlib.pyplot as plt
import pandas as pd

labels = [2 if val == 1 else 1 if val == 2 else val for val in kmeans.labels_]

plt.figure(figsize=(16, 6))

draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    labels,
    plt.subplot(1, 2, 1),
)
plt.title('Clusters (PCA-reduced data)')

draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    df['HeartDisease'],
    plt.subplot(1, 2, 2),
)
plt.title('True Labels (PCA-reduced data)')

plt.tight_layout()
plt.show()

In [ ]:

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def get_clusters_inertia(data, random_state, max_clusters=10):
    inertias = []
    clusters_range = range(1, max_clusters + 1)
    
    for n_clusters in clusters_range:
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        kmeans.fit(data)
        inertias.append(kmeans.inertia_)
    
    return inertias, clusters_range

def draw_elbow_diagram(inertias, clusters_range):
    plt.figure(figsize=(10, 6))
    plt.plot(clusters_range, inertias, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal Number of Clusters')
    plt.show()

random_state = 42
max_clusters = 10

inertias, clusters_range = get_clusters_inertia(reduced_data, random_state, max_clusters)

print("Clusters Range:", list(clusters_range))
print("Inertias:", inertias)

draw_elbow_diagram(inertias, clusters_range)

Clusters Range: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Inertias: [40898793.73610622, 20383039.394992765, 10558797.359806543, 8253581.349853665, 5211677.9440700915, 4725347.803894365, 3415392.2695708266, 2746581.745507652, 2327078.2781427936, 2038724.3739189352]

In [30]:

from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

def get_clusters_silhouette_scores(data, random_state, max_clusters=10):
    silhouette_scores = []
    clusters_range = range(2, max_clusters + 1)
    
    for n_clusters in clusters_range:
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = kmeans.fit_predict(data)
        score = silhouette_score(data, labels)
        silhouette_scores.append(score)
    
    return silhouette_scores, clusters_range

def draw_silhouettes_diagram(silhouette_scores, clusters_range):
    plt.figure(figsize=(10, 6))
    plt.plot(clusters_range, silhouette_scores, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Analysis for Optimal Number of Clusters')
    plt.show()

random_state = 42
max_clusters = 10

scaler = StandardScaler()
reduced_data = scaler.fit_transform(reduced_data)

silhouette_scores, clusters_range = get_clusters_silhouette_scores(reduced_data, random_state, max_clusters)

print("Clusters Range:", list(clusters_range))
print("Silhouette Scores:", silhouette_scores)

draw_silhouettes_diagram(silhouette_scores, clusters_range)

Clusters Range: [2, 3, 4, 5, 6, 7, 8, 9, 10]
Silhouette Scores: [np.float64(0.6907065667264102), np.float64(0.7357946226010675), np.float64(0.7471282832991772), np.float64(0.6611810724468269), np.float64(0.46906053559429145), np.float64(0.514038817014376), np.float64(0.5134186364121586), np.float64(0.5108072647247368), np.float64(0.5339567370287389)]

In [33]:

import math
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
import numpy as np

def get_clusters_silhouettes(data, random_state):
    max_clusters = min(12, int(math.sqrt(len(data))))
    silhouettes = []
    
    for n_clusters in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        cluster_labels = kmeans.fit_predict(data)
        silhouette_avg = silhouette_score(data, cluster_labels)
        silhouettes.append((n_clusters, silhouette_avg, cluster_labels, kmeans.cluster_centers_))
    
    return silhouettes

def draw_silhouettes(data, silhouettes):
    for n_clusters, silhouette_avg, cluster_labels, centers in silhouettes:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        
        ax1.set_xlim([-0.1, 1])
        ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])
        silhouette_values = silhouette_samples(data, cluster_labels)
        y_lower = 10
        
        for i in range(n_clusters):
            cluster_silhouette_values = silhouette_values[cluster_labels == i]
            cluster_silhouette_values.sort()
            cluster_size = cluster_silhouette_values.shape[0]
            y_upper = y_lower + cluster_size
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values, alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * cluster_size, str(i))
            y_lower = y_upper + 10
        
        ax1.set_title("Silhouette plot for {} clusters".format(n_clusters))
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])
        
        ax2.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', s=30)
        ax2.scatter(centers[:, 0], centers[:, 1], marker='x', c='red', s=200, alpha=1)
        ax2.set_title("Clustered data for {} clusters".format(n_clusters))
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")
        
        plt.suptitle(("Silhouette analysis for KMeans clustering with {} clusters".format(n_clusters)),
                     fontsize=14, fontweight='bold')
        plt.show()

random_state = 42

silhouettes = get_clusters_silhouettes(reduced_data, random_state)

draw_silhouettes(reduced_data, silhouettes)

$No description has been provided for this image$

3.1 MiB Raw Blame History Unescape Escape

3.1 MiB

Raw Blame History