3.1 MiB
Raw Blame History

In [1]:
import pandas as pd 
df = pd.read_csv("..//static//csv//heart_2020_cleaned.csv")
print(df.columns)

display(df.head(10))
Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')
HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth DiffWalking Sex AgeCategory Race Diabetic PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer
0 No 16.60 Yes No No 3.0 30.0 No Female 55-59 White Yes Yes Very good 5.0 Yes No Yes
1 No 20.34 No No Yes 0.0 0.0 No Female 80 or older White No Yes Very good 7.0 No No No
2 No 26.58 Yes No No 20.0 30.0 No Male 65-69 White Yes Yes Fair 8.0 Yes No No
3 No 24.21 No No No 0.0 0.0 No Female 75-79 White No No Good 6.0 No No Yes
4 No 23.71 No No No 28.0 0.0 Yes Female 40-44 White No Yes Very good 8.0 No No No
5 Yes 28.87 Yes No No 6.0 0.0 Yes Female 75-79 Black No No Fair 12.0 No No No
6 No 21.63 No No No 15.0 0.0 No Female 70-74 White No Yes Fair 4.0 Yes No Yes
7 No 31.64 Yes No No 5.0 0.0 Yes Female 80 or older White Yes No Good 9.0 Yes No No
8 No 26.45 No No No 0.0 0.0 No Female 80 or older White No, borderline diabetes No Fair 5.0 No Yes No
9 No 40.69 No No No 0.0 0.0 Yes Male 65-69 White No Yes Good 10.0 No No No

Сегментация пациентов по рискам сердечно-сосудистых заболеваний Цель: Определить группы пациентов с различными уровнями риска развития сердечно-сосудистых заболеваний на основе их демографических данных, образа жизни и состояния здоровья.

Пример:

Кластер 1: Пациенты с высоким риском (курение, высокий ИМТ, низкая физическая активность).

Кластер 2: Пациенты со средним риском (умеренное курение, средний ИМТ, средняя физическая активность).

Кластер 3: Пациенты с низким риском (отсутствие вредных привычек, нормальный ИМТ, высокая физическая активность).

Бизнес-применение:

Разработка персонализированных программ профилактики и лечения для каждой группы пациентов.

Таргетированная реклама медицинских услуг и продуктов для улучшения образа жизни.

In [ ]:
import pandas as pd

df = pd.read_csv("..//static//csv//heart_2020_cleaned.csv").head(1000)
df = df.dropna()
from scipy import stats

z_scores = stats.zscore(df[['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']])
abs_z_scores = abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
df = df[filtered_entries]
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

X_resampled, y_resampled = ros.fit_resample(X, y)

df_balanced = X_resampled.copy()
df_balanced['HeartDisease'] = y_resampled

df = df_balanced
In [ ]:
scaler = StandardScaler()
df_encoded[numeric_features] = scaler.fit_transform(df_encoded[numeric_features])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 2
      1 ## стандартизация
----> 2 scaler = StandardScaler()
      3 df_encoded[numeric_features] = scaler.fit_transform(df_encoded[numeric_features])

NameError: name 'StandardScaler' is not defined
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(16, 12))

plt.subplot(2, 2, 1)
sns.scatterplot(x='BMI', y='PhysicalHealth', hue='HeartDisease', data=df)
plt.title('BMI vs PhysicalHealth')

plt.subplot(2, 2, 2)
sns.scatterplot(x='MentalHealth', y='SleepTime', hue='HeartDisease', data=df)
plt.title('MentalHealth vs SleepTime')

plt.subplot(2, 2, 3)
sns.scatterplot(x='PhysicalHealth', y='SleepTime', hue='HeartDisease', data=df)
plt.title('PhysicalHealth vs SleepTime')

plt.subplot(2, 2, 4)
sns.scatterplot(x='BMI', y='MentalHealth', hue='HeartDisease', data=df)
plt.title('BMI vs MentalHealth')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
plt.figure(figsize=(16, 12))

plt.subplot(2, 2, 1)
sns.scatterplot(x='BMI', y='PhysicalHealth', data=df)
plt.title('BMI vs PhysicalHealth')

plt.subplot(2, 2, 2)
sns.scatterplot(x='MentalHealth', y='SleepTime', data=df)
plt.title('MentalHealth vs SleepTime')

plt.subplot(2, 2, 3)
sns.scatterplot(x='PhysicalHealth', y='SleepTime', data=df)
plt.title('PhysicalHealth vs SleepTime')

plt.subplot(2, 2, 4)
sns.scatterplot(x='BMI', y='MentalHealth', data=df)
plt.title('BMI vs MentalHealth')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

X = df[['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']]

agg_clustering = AgglomerativeClustering(n_clusters=3)
clusters = agg_clustering.fit_predict(X)

df['Cluster'] = clusters

linked = linkage(X, 'ward')

plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Dendrogram')
plt.show()
No description has been provided for this image
In [ ]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt

def draw_data_2d(data, feature_x, feature_y, labels, subplot):
    subplot.scatter(data.iloc[:, feature_x], data.iloc[:, feature_y], c=labels, cmap='viridis')
    subplot.set_xlabel(data.columns[feature_x])
    subplot.set_ylabel(data.columns[feature_y])

X = df[['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']]

kmeans = MiniBatchKMeans(n_clusters=3, random_state=0, batch_size=100)
result = kmeans.fit_predict(X)

plt.figure(figsize=(16, 24))

plt.subplot(4, 2, 1)
draw_data_2d(X, 0, 1, result, plt.subplot(4, 2, 1))
plt.title('Clusters (BMI vs PhysicalHealth)')

plt.subplot(4, 2, 2)
draw_data_2d(X, 0, 1, df['HeartDisease'], plt.subplot(4, 2, 2))
plt.title('True Labels (BMI vs PhysicalHealth)')

plt.subplot(4, 2, 3)
draw_data_2d(X, 2, 3, result, plt.subplot(4, 2, 3))
plt.title('Clusters (MentalHealth vs SleepTime)')

plt.subplot(4, 2, 4)
draw_data_2d(X, 2, 3, df['HeartDisease'], plt.subplot(4, 2, 4))
plt.title('True Labels (MentalHealth vs SleepTime)')

plt.subplot(4, 2, 5)
draw_data_2d(X, 0, 2, result, plt.subplot(4, 2, 5))
plt.title('Clusters (BMI vs MentalHealth)')

plt.subplot(4, 2, 6)
draw_data_2d(X, 0, 2, df['HeartDisease'], plt.subplot(4, 2, 6))
plt.title('True Labels (BMI vs MentalHealth)')

plt.subplot(4, 2, 7)
draw_data_2d(X, 1, 3, result, plt.subplot(4, 2, 7))
plt.title('Clusters (PhysicalHealth vs SleepTime)')

plt.subplot(4, 2, 8)
draw_data_2d(X, 1, 3, df['HeartDisease'], plt.subplot(4, 2, 8))
plt.title('True Labels (PhysicalHealth vs SleepTime)')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def run_kmeans(data, n_clusters, random_state):
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(data)
    centers = kmeans.cluster_centers_
    return labels, centers

def print_cluster_result(data, n_clusters, labels):
    for i in range(n_clusters):
        cluster_indices = [index for index, label in enumerate(labels) if label == i]
        print(f"Cluster {i+1} ({len(cluster_indices)}):")
        print(", ".join(map(str, cluster_indices)))
        print("--------")

def draw_cluster_results(data, feature_x, feature_y, labels, centers, subplot):
    subplot.scatter(data.iloc[:, feature_x], data.iloc[:, feature_y], c=labels, cmap='viridis')
    subplot.scatter(centers[:, feature_x], centers[:, feature_y], marker='x', s=200, linewidths=3, color='r')
    subplot.set_xlabel(data.columns[feature_x])
    subplot.set_ylabel(data.columns[feature_y])

random_state = 9
n_clusters = 3
labels, centers = run_kmeans(X, n_clusters, random_state)
print_cluster_result(X, n_clusters, labels)
display(centers)
display(df['HeartDisease'])

plt.figure(figsize=(16, 12))
draw_cluster_results(X, 0, 1, labels, centers, plt.subplot(2, 2, 1))
draw_cluster_results(X, 2, 3, labels, centers, plt.subplot(2, 2, 2))
draw_cluster_results(X, 0, 2, labels, centers, plt.subplot(2, 2, 3))
draw_cluster_results(X, 1, 3, labels, centers, plt.subplot(2, 2, 4))
plt.tight_layout()
plt.show()
Cluster 1 (120):
4, 6, 10, 26, 32, 44, 49, 54, 57, 65, 66, 71, 76, 80, 81, 90, 97, 105, 119, 121, 130, 155, 157, 166, 206, 218, 219, 239, 240, 247, 263, 267, 278, 279, 281, 284, 288, 293, 297, 306, 315, 317, 323, 326, 333, 335, 343, 346, 355, 378, 398, 402, 406, 407, 410, 418, 431, 443, 447, 453, 455, 469, 499, 516, 539, 543, 544, 545, 551, 559, 570, 575, 581, 586, 597, 601, 602, 605, 609, 615, 616, 618, 619, 622, 629, 647, 650, 653, 659, 668, 680, 686, 689, 703, 718, 725, 728, 731, 732, 743, 749, 760, 789, 818, 833, 840, 857, 866, 888, 906, 915, 930, 939, 945, 948, 964, 969, 976, 990, 998
--------
Cluster 2 (796):
1, 3, 5, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22, 24, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 50, 51, 52, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 67, 68, 69, 70, 72, 73, 74, 75, 77, 79, 82, 83, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96, 98, 99, 100, 102, 103, 104, 107, 108, 109, 110, 111, 112, 113, 114, 116, 117, 118, 120, 122, 123, 124, 125, 127, 128, 129, 131, 132, 133, 134, 135, 137, 140, 141, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 158, 159, 160, 161, 163, 164, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, 190, 191, 192, 193, 194, 196, 197, 198, 199, 201, 202, 203, 204, 205, 207, 208, 209, 210, 211, 212, 213, 214, 215, 217, 220, 221, 222, 223, 224, 225, 226, 227, 228, 230, 232, 233, 234, 235, 236, 237, 238, 241, 243, 244, 245, 246, 248, 249, 251, 252, 253, 255, 256, 257, 258, 259, 260, 261, 262, 264, 265, 268, 269, 271, 274, 275, 276, 277, 280, 282, 283, 285, 286, 287, 289, 291, 292, 294, 295, 298, 299, 300, 301, 302, 303, 304, 305, 307, 308, 309, 310, 311, 312, 313, 314, 316, 318, 319, 320, 321, 322, 324, 325, 327, 328, 329, 330, 331, 332, 334, 337, 338, 339, 340, 341, 342, 344, 345, 347, 348, 349, 351, 352, 353, 354, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 379, 380, 381, 382, 383, 385, 386, 387, 389, 390, 391, 392, 393, 394, 395, 396, 397, 399, 400, 403, 405, 408, 409, 411, 412, 413, 414, 415, 416, 417, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 444, 445, 446, 448, 449, 450, 452, 454, 456, 457, 458, 460, 461, 462, 463, 464, 465, 466, 467, 468, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 491, 492, 493, 494, 495, 496, 497, 498, 500, 501, 502, 505, 506, 507, 508, 509, 510, 512, 513, 514, 515, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 540, 541, 542, 546, 547, 548, 549, 550, 552, 554, 555, 556, 557, 558, 560, 561, 562, 563, 565, 566, 567, 569, 571, 573, 574, 576, 577, 578, 579, 580, 582, 583, 584, 585, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 598, 599, 600, 604, 607, 608, 610, 611, 612, 613, 614, 617, 620, 621, 623, 624, 626, 627, 628, 630, 631, 632, 633, 634, 635, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 648, 649, 651, 652, 654, 655, 656, 658, 660, 661, 662, 663, 665, 666, 667, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 681, 682, 683, 684, 685, 687, 688, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 701, 702, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 719, 720, 721, 722, 723, 724, 726, 729, 730, 733, 734, 735, 736, 737, 739, 740, 741, 742, 744, 745, 746, 748, 751, 752, 753, 754, 755, 756, 757, 759, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 775, 776, 777, 778, 779, 780, 781, 782, 784, 785, 786, 787, 788, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 834, 835, 836, 837, 839, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 858, 859, 860, 861, 862, 863, 864, 865, 867, 869, 870, 871, 872, 873, 874, 875, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 889, 890, 892, 893, 894, 895, 898, 899, 900, 901, 902, 903, 904, 905, 907, 909, 910, 911, 912, 913, 914, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 931, 933, 934, 935, 936, 938, 940, 941, 942, 943, 944, 946, 947, 949, 950, 951, 952, 953, 954, 955, 956, 959, 961, 962, 963, 965, 967, 970, 971, 972, 973, 974, 975, 978, 979, 980, 981, 982, 983, 984, 985, 986, 988, 989, 991, 992, 993, 994, 996, 997, 999
--------
Cluster 3 (84):
0, 2, 14, 17, 23, 25, 78, 84, 101, 106, 115, 126, 136, 138, 139, 142, 156, 162, 165, 188, 195, 200, 216, 229, 231, 242, 250, 254, 266, 270, 272, 273, 290, 296, 336, 350, 384, 388, 401, 404, 430, 451, 459, 490, 503, 504, 511, 538, 553, 564, 568, 572, 603, 606, 625, 636, 657, 664, 700, 727, 738, 747, 750, 758, 774, 783, 801, 838, 868, 876, 891, 896, 897, 908, 932, 937, 957, 958, 960, 966, 968, 977, 987, 995
--------
array([[31.04766667, 24.575     ,  2.90833333,  7.21666667],
       [28.68121859,  1.00125628,  0.97361809,  7.33165829],
       [30.42404762, 10.45238095, 26.0952381 ,  6.42857143]])
0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    1
998    0
999    0
Name: HeartDisease, Length: 1000, dtype: int64
No description has been provided for this image
In [ ]:
from sklearn.decomposition import PCA

reduced_data = PCA(n_components=2).fit_transform(X)

print(reduced_data)
[[  9.31321501  26.77646868]
 [ -6.91783938   1.05243249]
 [ 25.54665344  17.10779298]
 ...
 [ -5.04837552  -0.86215887]
 [ 22.80197652 -17.26780336]
 [ -6.07756748  -0.15069417]]
In [ ]:
import matplotlib.pyplot as plt
import pandas as pd

plt.figure(figsize=(16, 6))

draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    labels=[0] * len(reduced_data),  
    subplot=plt.subplot(1, 2, 1),
)
plt.title('PCA Data (Without True Labels)')

draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    df['HeartDisease'],
    plt.subplot(1, 2, 2),
)
plt.title('PCA Data (With True Labels)')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def fit_kmeans(data, n_clusters, random_state):
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(data)
    return kmeans

def draw_clusters(data, kmeans):
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_
    
    plt.figure(figsize=(10, 6))
    plt.scatter(data[:, 0], data[:, 1], c=labels, cmap='viridis', alpha=0.6)
    plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, linewidths=3, color='r')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('K-means Clustering (2 Clusters) with PCA')
    plt.show()

kmeans = fit_kmeans(reduced_data, 2, random_state)
draw_clusters(reduced_data, kmeans)
No description has been provided for this image
In [ ]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def fit_kmeans(data, n_clusters, random_state):
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(data)
    return kmeans

def draw_clusters(data, kmeans):
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_
    
    plt.figure(figsize=(10, 6))
    plt.scatter(data[:, 0], data[:, 1], c=labels, cmap='viridis', alpha=0.6)
    plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, linewidths=3, color='white')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('K-means Clustering (PCA-reduced data)')
    plt.show()

kmeans = fit_kmeans(reduced_data, 3, random_state)
draw_clusters(reduced_data, kmeans)
No description has been provided for this image
In [ ]:
import matplotlib.pyplot as plt
import pandas as pd

labels = [2 if val == 1 else 1 if val == 2 else val for val in kmeans.labels_]

plt.figure(figsize=(16, 6))

draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    labels,
    plt.subplot(1, 2, 1),
)
plt.title('Clusters (PCA-reduced data)')

draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    df['HeartDisease'],
    plt.subplot(1, 2, 2),
)
plt.title('True Labels (PCA-reduced data)')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def get_clusters_inertia(data, random_state, max_clusters=10):
    inertias = []
    clusters_range = range(1, max_clusters + 1)
    
    for n_clusters in clusters_range:
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        kmeans.fit(data)
        inertias.append(kmeans.inertia_)
    
    return inertias, clusters_range

def draw_elbow_diagram(inertias, clusters_range):
    plt.figure(figsize=(10, 6))
    plt.plot(clusters_range, inertias, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal Number of Clusters')
    plt.show()

random_state = 42
max_clusters = 10

inertias, clusters_range = get_clusters_inertia(reduced_data, random_state, max_clusters)

print("Clusters Range:", list(clusters_range))
print("Inertias:", inertias)

draw_elbow_diagram(inertias, clusters_range)
Clusters Range: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Inertias: [40898793.73610622, 20383039.394992765, 10558797.359806543, 8253581.349853665, 5211677.9440700915, 4725347.803894365, 3415392.2695708266, 2746581.745507652, 2327078.2781427936, 2038724.3739189352]
No description has been provided for this image
In [30]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

def get_clusters_silhouette_scores(data, random_state, max_clusters=10):
    silhouette_scores = []
    clusters_range = range(2, max_clusters + 1)
    
    for n_clusters in clusters_range:
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = kmeans.fit_predict(data)
        score = silhouette_score(data, labels)
        silhouette_scores.append(score)
    
    return silhouette_scores, clusters_range

def draw_silhouettes_diagram(silhouette_scores, clusters_range):
    plt.figure(figsize=(10, 6))
    plt.plot(clusters_range, silhouette_scores, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Analysis for Optimal Number of Clusters')
    plt.show()

random_state = 42
max_clusters = 10

scaler = StandardScaler()
reduced_data = scaler.fit_transform(reduced_data)

silhouette_scores, clusters_range = get_clusters_silhouette_scores(reduced_data, random_state, max_clusters)

print("Clusters Range:", list(clusters_range))
print("Silhouette Scores:", silhouette_scores)

draw_silhouettes_diagram(silhouette_scores, clusters_range)
Clusters Range: [2, 3, 4, 5, 6, 7, 8, 9, 10]
Silhouette Scores: [np.float64(0.6907065667264102), np.float64(0.7357946226010675), np.float64(0.7471282832991772), np.float64(0.6611810724468269), np.float64(0.46906053559429145), np.float64(0.514038817014376), np.float64(0.5134186364121586), np.float64(0.5108072647247368), np.float64(0.5339567370287389)]
No description has been provided for this image
In [33]:
import math
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
import numpy as np

def get_clusters_silhouettes(data, random_state):
    max_clusters = min(12, int(math.sqrt(len(data))))
    silhouettes = []
    
    for n_clusters in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        cluster_labels = kmeans.fit_predict(data)
        silhouette_avg = silhouette_score(data, cluster_labels)
        silhouettes.append((n_clusters, silhouette_avg, cluster_labels, kmeans.cluster_centers_))
    
    return silhouettes

def draw_silhouettes(data, silhouettes):
    for n_clusters, silhouette_avg, cluster_labels, centers in silhouettes:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        
        ax1.set_xlim([-0.1, 1])
        ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])
        silhouette_values = silhouette_samples(data, cluster_labels)
        y_lower = 10
        
        for i in range(n_clusters):
            cluster_silhouette_values = silhouette_values[cluster_labels == i]
            cluster_silhouette_values.sort()
            cluster_size = cluster_silhouette_values.shape[0]
            y_upper = y_lower + cluster_size
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values, alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * cluster_size, str(i))
            y_lower = y_upper + 10
        
        ax1.set_title("Silhouette plot for {} clusters".format(n_clusters))
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])
        
        ax2.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', s=30)
        ax2.scatter(centers[:, 0], centers[:, 1], marker='x', c='red', s=200, alpha=1)
        ax2.set_title("Clustered data for {} clusters".format(n_clusters))
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")
        
        plt.suptitle(("Silhouette analysis for KMeans clustering with {} clusters".format(n_clusters)),
                     fontsize=14, fontweight='bold')
        plt.show()

random_state = 42

silhouettes = get_clusters_silhouettes(reduced_data, random_state)

draw_silhouettes(reduced_data, silhouettes)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image