In [62]:
import pandas as pd 
# Load the heart_2020_cleaned.csv dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="..//static//csv//heart_2020_cleaned.csv")

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth DiffWalking Sex AgeCategory Race Diabetic PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer
0 No 16.60 Yes No No 3.0 30.0 No Female 55-59 White Yes Yes Very good 5.0 Yes No Yes
1 No 20.34 No No Yes 0.0 0.0 No Female 80 or older White No Yes Very good 7.0 No No No
2 No 26.58 Yes No No 20.0 30.0 No Male 65-69 White Yes Yes Fair 8.0 Yes No No
3 No 24.21 No No No 0.0 0.0 No Female 75-79 White No No Good 6.0 No No Yes
4 No 23.71 No No No 28.0 0.0 Yes Female 40-44 White No Yes Very good 8.0 No No No
5 Yes 28.87 Yes No No 6.0 0.0 Yes Female 75-79 Black No No Fair 12.0 No No No
6 No 21.63 No No No 15.0 0.0 No Female 70-74 White No Yes Fair 4.0 Yes No Yes
7 No 31.64 Yes No No 5.0 0.0 Yes Female 80 or older White Yes No Good 9.0 Yes No No
8 No 26.45 No No No 0.0 0.0 No Female 80 or older White No, borderline diabetes No Fair 5.0 No Yes No
9 No 40.69 No No No 0.0 0.0 Yes Male 65-69 White No Yes Good 10.0 No No No

Сегментация пациентов по рискам сердечно-сосудистых заболеваний. Цель: определить группы пациентов с различными уровнями риска развития сердечно-сосудистых заболеваний на основе их демографических данных, образа жизни и состояния здоровья.


Кластер 1: Пациенты с высоким риском (курение, высокий ИМТ, низкая физическая активность).

Кластер 2: Пациенты со средним риском (умеренное курение, средний ИМТ, средняя физическая активность).

Кластер 3: Пациенты с низким риском (отсутствие вредных привычек, нормальный ИМТ, высокая физическая активность).


Разработка персонализированных программ профилактики и лечения для каждой группы пациентов.

Таргетированная реклама медицинских услуг и продуктов для улучшения образа жизни.

In [63]:
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from scipy import stats
from sklearn.preprocessing import LabelEncoder

# --- Load -------------------------------------------------------------------
# Only the first 1000 rows of the file are used; rows with missing values are
# dropped.
df = pd.read_csv("..//static//csv//heart_2020_cleaned.csv").head(1000)
df = df.dropna()

# --- Outlier removal --------------------------------------------------------
# Keep a row only if every numeric feature has |z-score| < 3.
z_scores = stats.zscore(df[['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']])
abs_z_scores = abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
# .copy() detaches the filtered rows from the frame they were selected from,
# so the column assignments in the encoding loop below write to an independent
# DataFrame instead of a slice (which makes pandas emit SettingWithCopyWarning).
df = df[filtered_entries].copy()

# --- Categorical encoding ---------------------------------------------------
# Every object-dtype column is replaced by integer codes. The fitted encoders
# are kept, keyed by column name, so codes can be mapped back to the original
# strings with label_encoders[column].inverse_transform(...).
label_encoders = {}

for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# --- Class balancing --------------------------------------------------------
# Randomly oversample with respect to the HeartDisease target.
# NOTE(review): the resampled frame replaces `df` and is what the clustering
# cells consume, so the added rows also influence the unsupervised results —
# confirm this is intended.
ros = RandomOverSampler(random_state=42)

X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

X_resampled, y_resampled = ros.fit_resample(X, y)

df_balanced = X_resampled.copy()
df_balanced['HeartDisease'] = y_resampled

df = df_balanced
In [64]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise scatter plots of the numeric features, coloured by the HeartDisease
# column. Each (x, y) pair fills one panel of a 2x2 grid; driving the panels
# from a list replaces four copy-pasted stanzas.
feature_pairs = [
    ('BMI', 'PhysicalHealth'),
    ('MentalHealth', 'SleepTime'),
    ('PhysicalHealth', 'SleepTime'),
    ('BMI', 'MentalHealth'),
]

fig = plt.figure(figsize=(16, 12))
for panel, (x_col, y_col) in enumerate(feature_pairs, start=1):
    ax = fig.add_subplot(2, 2, panel)
    sns.scatterplot(x=x_col, y=y_col, hue='HeartDisease', data=df, ax=ax)
    ax.set_title(f'{x_col} vs {y_col}')

# Explicit show() renders the figure and keeps the Text repr of the last
# title call out of the cell output.
plt.show()

In [65]:
# The same four feature pairs as a 2x2 grid, this time without any colouring,
# to show the raw point distribution. The panels are driven from a list
# instead of four copy-pasted stanzas.
feature_pairs = [
    ('BMI', 'PhysicalHealth'),
    ('MentalHealth', 'SleepTime'),
    ('PhysicalHealth', 'SleepTime'),
    ('BMI', 'MentalHealth'),
]

fig = plt.figure(figsize=(16, 12))
for panel, (x_col, y_col) in enumerate(feature_pairs, start=1):
    ax = fig.add_subplot(2, 2, panel)
    sns.scatterplot(x=x_col, y=y_col, data=df, ax=ax)
    ax.set_title(f'{x_col} vs {y_col}')

# Explicit show() renders the figure and keeps the Text repr of the last
# title call out of the cell output.
plt.show()

In [66]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Numeric features used for clustering.
X = df[['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']]

# Flat assignment of every row to one of 3 clusters
# (AgglomerativeClustering uses Ward linkage by default).
agg_clustering = AgglomerativeClustering(n_clusters=3)
clusters = agg_clustering.fit_predict(X)

df['Cluster'] = clusters

# Full Ward merge tree over the same features, used for the dendrogram.
linked = linkage(X, 'ward')

# Previously the figure was created but nothing was drawn into it; render the
# dendrogram that `linked` was computed for. Leaf labels are suppressed
# because one tick label per sample would be unreadable.
plt.figure(figsize=(10, 7))
dendrogram(linked, orientation='top', distance_sort='descending', no_labels=True)
plt.title('Hierarchical Clustering Dendrogram (Ward linkage)')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()
In [67]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt

def draw_data_2d(data, feature_x, feature_y, labels, subplot):
    """Scatter two columns of ``data`` on ``subplot``, coloured by ``labels``.

    Args:
        data: Object supporting positional ``.iloc[:, k]`` column access
            (e.g. a pandas DataFrame).
        feature_x: Positional index of the column plotted on the x axis.
        feature_y: Positional index of the column plotted on the y axis.
        labels: Per-point values passed to ``scatter`` as ``c``.
        subplot: Object with a ``scatter`` method (e.g. a matplotlib Axes).
    """
    x_values = data.iloc[:, feature_x]
    y_values = data.iloc[:, feature_y]
    subplot.scatter(x_values, y_values, c=labels, cmap='viridis')

# Numeric features used for clustering.
X = df[['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']]

kmeans = MiniBatchKMeans(n_clusters=3, random_state=0, batch_size=100)
result = kmeans.fit_predict(X)

# Positional column pairs of X, one per grid row. The left panel is coloured
# by the MiniBatchKMeans assignment and the right panel by HeartDisease, so
# the two can be compared side by side.
feature_index_pairs = [(0, 1), (2, 3), (0, 2), (1, 3)]

fig = plt.figure(figsize=(16, 24))
for row, (ix, iy) in enumerate(feature_index_pairs):
    # Titles are derived from the column names so they cannot drift from the
    # indices that are actually plotted.
    pair_name = f'{X.columns[ix]} vs {X.columns[iy]}'

    # Each Axes is created exactly once (previously every subplot call was
    # issued twice per panel).
    clusters_ax = fig.add_subplot(4, 2, 2 * row + 1)
    draw_data_2d(X, ix, iy, result, clusters_ax)
    clusters_ax.set_title(f'Clusters ({pair_name})')

    truth_ax = fig.add_subplot(4, 2, 2 * row + 2)
    draw_data_2d(X, ix, iy, df['HeartDisease'], truth_ax)
    truth_ax.set_title(f'True Labels ({pair_name})')

# Explicit show() renders the figure and keeps the Text repr of the last
# title call out of the cell output.
plt.show()

In [68]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def run_kmeans(data, n_clusters, random_state):
    """Fit K-means on ``data`` and return the assignments and centroids.

    Args:
        data: Samples to cluster.
        n_clusters: Number of clusters to form.
        random_state: Seed forwarded to ``KMeans``.

    Returns:
        A ``(labels, centers)`` tuple: the per-sample cluster labels and the
        fitted ``cluster_centers_``.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state).fit(data)
    return model.labels_, model.cluster_centers_

def print_cluster_result(data, n_clusters, labels):
    """Print, for each cluster, its size and the positions of its members.

    Args:
        data: Unused; kept so the signature matches the other helpers.
        n_clusters: Number of clusters; cluster ids ``0 .. n_clusters - 1``
            are reported under the 1-based headings ``Cluster 1``, ...
        labels: Iterable of cluster ids, one per sample.
    """
    for cluster_id in range(n_clusters):
        members = []
        for position, assigned in enumerate(labels):
            if assigned == cluster_id:
                members.append(str(position))
        print(f"Cluster {cluster_id+1} ({len(members)}):")
        print(", ".join(members))

def draw_cluster_results(data, feature_x, feature_y, labels, centers, subplot):
    """Scatter two features coloured by cluster and overlay the centroids.

    Args:
        data: Object supporting positional ``.iloc[:, k]`` column access.
        feature_x: Positional index of the feature on the x axis.
        feature_y: Positional index of the feature on the y axis.
        labels: Per-point values passed to ``scatter`` as ``c``.
        centers: 2-D array indexed as ``centers[:, k]``; its columns are read
            with the same feature indices as ``data``.
        subplot: Object with a ``scatter`` method (e.g. a matplotlib Axes).
    """
    point_x = data.iloc[:, feature_x]
    point_y = data.iloc[:, feature_y]
    subplot.scatter(point_x, point_y, c=labels, cmap='viridis')

    # Centroids are drawn second so the red crosses sit on top of the points.
    center_x = centers[:, feature_x]
    center_y = centers[:, feature_y]
    subplot.scatter(center_x, center_y, marker='x', s=200, linewidths=3, color='r')

# K-means with 3 clusters on X (X is not assigned in this cell; it is whatever
# an earlier cell left in the kernel).
random_state = 9
n_clusters = 3
labels, centers = run_kmeans(X, n_clusters, random_state)
print_cluster_result(X, n_clusters, labels)

# One panel per feature pair, in the order (0,1), (2,3), (0,2), (1,3).
plt.figure(figsize=(16, 12))
for position, (col_x, col_y) in enumerate([(0, 1), (2, 3), (0, 2), (1, 3)], start=1):
    draw_cluster_results(X, col_x, col_y, labels, centers, plt.subplot(2, 2, position))
Cluster 1 (482):
5, 7, 20, 22, 24, 27, 28, 36, 39, 44, 46, 49, 56, 62, 65, 66, 67, 70, 71, 78, 80, 83, 97, 103, 105, 114, 115, 119, 121, 124, 130, 135, 137, 139, 140, 143, 148, 150, 153, 154, 155, 156, 159, 161, 162, 164, 168, 192, 197, 198, 204, 206, 211, 213, 214, 217, 224, 226, 227, 233, 237, 245, 249, 253, 255, 256, 259, 260, 261, 262, 264, 271, 273, 278, 281, 283, 284, 290, 291, 296, 301, 303, 307, 309, 312, 314, 315, 321, 323, 329, 330, 331, 332, 335, 337, 340, 342, 351, 352, 355, 356, 358, 360, 369, 376, 379, 380, 382, 385, 386, 390, 392, 398, 404, 405, 406, 411, 417, 419, 420, 422, 423, 427, 432, 435, 440, 441, 442, 446, 447, 450, 451, 453, 455, 456, 458, 460, 462, 466, 471, 474, 477, 486, 493, 495, 503, 510, 512, 516, 521, 523, 524, 531, 536, 537, 538, 543, 544, 545, 557, 567, 571, 573, 576, 578, 582, 587, 589, 591, 592, 595, 597, 598, 600, 603, 605, 608, 612, 613, 617, 620, 621, 624, 633, 640, 643, 646, 647, 651, 652, 653, 654, 655, 656, 664, 667, 669, 674, 677, 682, 692, 694, 695, 696, 697, 698, 701, 702, 705, 709, 714, 724, 725, 726, 736, 738, 739, 741, 743, 747, 756, 757, 758, 762, 764, 778, 779, 784, 785, 786, 791, 797, 798, 800, 804, 805, 808, 810, 811, 812, 813, 818, 819, 821, 823, 824, 825, 827, 831, 833, 835, 836, 838, 844, 852, 854, 858, 861, 866, 867, 874, 880, 883, 885, 886, 887, 888, 890, 895, 898, 906, 907, 910, 911, 913, 916, 917, 919, 920, 930, 935, 940, 943, 944, 947, 957, 961, 964, 965, 967, 968, 973, 977, 980, 982, 991, 996, 997, 999, 1001, 1003, 1005, 1013, 1018, 1023, 1025, 1029, 1031, 1035, 1041, 1045, 1046, 1047, 1051, 1053, 1058, 1062, 1063, 1064, 1068, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1078, 1079, 1081, 1084, 1086, 1089, 1090, 1092, 1095, 1096, 1102, 1106, 1107, 1108, 1110, 1111, 1112, 1115, 1123, 1125, 1126, 1135, 1141, 1147, 1148, 1149, 1150, 1151, 1154, 1156, 1161, 1162, 1166, 1175, 1180, 1181, 1185, 1193, 1198, 1202, 1213, 1216, 1223, 1231, 1232, 1237, 1240, 1241, 1242, 1243, 1245, 1250, 1254, 1255, 1256, 1257, 1261, 1271, 1272, 1277, 
1280, 1281, 1282, 1284, 1286, 1289, 1292, 1295, 1300, 1302, 1303, 1306, 1308, 1310, 1313, 1314, 1317, 1320, 1321, 1322, 1324, 1325, 1327, 1328, 1329, 1330, 1332, 1334, 1335, 1341, 1344, 1345, 1354, 1355, 1357, 1358, 1364, 1370, 1371, 1374, 1375, 1380, 1382, 1386, 1387, 1388, 1390, 1391, 1400, 1406, 1407, 1408, 1412, 1415, 1423, 1424, 1427, 1430, 1437, 1438, 1439, 1441, 1443, 1446, 1448, 1456, 1457, 1466, 1472, 1480, 1482, 1487, 1490, 1496, 1499, 1512, 1513, 1515, 1518, 1521, 1523, 1527, 1536, 1541, 1542, 1547, 1551, 1555, 1558, 1560, 1568, 1574, 1575, 1576, 1577
Cluster 2 (286):
2, 4, 19, 25, 37, 42, 47, 50, 58, 59, 64, 69, 72, 73, 82, 89, 108, 110, 118, 126, 129, 142, 144, 151, 179, 189, 199, 201, 202, 220, 221, 228, 241, 244, 251, 252, 254, 257, 265, 268, 277, 286, 288, 294, 297, 304, 306, 313, 316, 324, 347, 366, 370, 373, 374, 377, 397, 409, 413, 418, 433, 454, 463, 479, 501, 505, 506, 518, 526, 530, 540, 551, 555, 556, 559, 562, 568, 569, 579, 596, 599, 602, 607, 616, 628, 636, 649, 670, 672, 675, 676, 685, 690, 700, 727, 755, 770, 775, 793, 802, 837, 845, 860, 862, 869, 875, 878, 893, 896, 903, 915, 923, 932, 933, 937, 939, 949, 960, 966, 969, 970, 972, 975, 976, 978, 981, 984, 985, 988, 989, 990, 1002, 1004, 1008, 1010, 1011, 1012, 1016, 1017, 1028, 1032, 1033, 1038, 1042, 1043, 1048, 1050, 1052, 1054, 1060, 1061, 1066, 1067, 1077, 1080, 1082, 1099, 1100, 1101, 1113, 1117, 1119, 1122, 1124, 1131, 1132, 1134, 1136, 1139, 1140, 1144, 1152, 1153, 1155, 1157, 1169, 1176, 1179, 1182, 1187, 1189, 1190, 1200, 1201, 1204, 1207, 1215, 1218, 1224, 1236, 1238, 1247, 1251, 1252, 1253, 1263, 1267, 1268, 1270, 1288, 1290, 1296, 1297, 1298, 1301, 1307, 1311, 1319, 1336, 1340, 1346, 1348, 1349, 1351, 1353, 1359, 1361, 1363, 1368, 1376, 1377, 1378, 1385, 1394, 1401, 1402, 1404, 1410, 1411, 1414, 1418, 1419, 1420, 1425, 1428, 1429, 1432, 1433, 1434, 1436, 1444, 1449, 1451, 1452, 1458, 1460, 1461, 1463, 1464, 1468, 1469, 1474, 1476, 1481, 1492, 1493, 1500, 1501, 1503, 1504, 1506, 1507, 1509, 1510, 1511, 1516, 1520, 1526, 1530, 1533, 1535, 1539, 1540, 1543, 1553, 1554, 1556, 1559, 1562, 1566, 1569, 1570, 1573, 1580, 1581, 1582
Cluster 3 (816):
0, 1, 3, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 23, 26, 29, 30, 31, 32, 33, 34, 35, 38, 40, 41, 43, 45, 48, 51, 52, 53, 54, 55, 57, 60, 61, 63, 68, 74, 75, 76, 77, 79, 81, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 104, 106, 107, 109, 111, 112, 113, 116, 117, 120, 122, 123, 125, 127, 128, 131, 132, 133, 134, 136, 138, 141, 145, 146, 147, 149, 152, 157, 158, 160, 163, 165, 166, 167, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191, 193, 194, 195, 196, 200, 203, 205, 207, 208, 209, 210, 212, 215, 216, 218, 219, 222, 223, 225, 229, 230, 231, 232, 234, 235, 236, 238, 239, 240, 242, 243, 246, 247, 248, 250, 258, 263, 266, 267, 269, 270, 272, 274, 275, 276, 279, 280, 282, 285, 287, 289, 292, 293, 295, 298, 299, 300, 302, 305, 308, 310, 311, 317, 318, 319, 320, 322, 325, 326, 327, 328, 333, 334, 336, 338, 339, 341, 343, 344, 345, 346, 348, 349, 350, 353, 354, 357, 359, 361, 362, 363, 364, 365, 367, 368, 371, 372, 375, 378, 381, 383, 384, 387, 388, 389, 391, 393, 394, 395, 396, 399, 400, 401, 402, 403, 407, 408, 410, 412, 414, 415, 416, 421, 424, 425, 426, 428, 429, 430, 431, 434, 436, 437, 438, 439, 443, 444, 445, 448, 449, 452, 457, 459, 461, 464, 465, 467, 468, 469, 470, 472, 473, 475, 476, 478, 480, 481, 482, 483, 484, 485, 487, 488, 489, 490, 491, 492, 494, 496, 497, 498, 499, 500, 502, 504, 507, 508, 509, 511, 513, 514, 515, 517, 519, 520, 522, 525, 527, 528, 529, 532, 533, 534, 535, 539, 541, 542, 546, 547, 548, 549, 550, 552, 553, 554, 558, 560, 561, 563, 564, 565, 566, 570, 572, 574, 575, 577, 580, 581, 583, 584, 585, 586, 588, 590, 593, 594, 601, 604, 606, 609, 610, 611, 614, 615, 618, 619, 622, 623, 625, 626, 627, 629, 630, 631, 632, 634, 635, 637, 638, 639, 641, 642, 644, 645, 648, 650, 657, 658, 659, 660, 661, 662, 663, 665, 666, 668, 671, 673, 678, 679, 680, 681, 683, 684, 686, 687, 688, 689, 691, 693, 699, 703, 704, 706, 707, 708, 710, 711, 712, 713, 715, 716, 717, 
718, 719, 720, 721, 722, 723, 728, 729, 730, 731, 732, 733, 734, 735, 737, 740, 742, 744, 745, 746, 748, 749, 750, 751, 752, 753, 754, 759, 760, 761, 763, 765, 766, 767, 768, 769, 771, 772, 773, 774, 776, 777, 780, 781, 782, 783, 787, 788, 789, 790, 792, 794, 795, 796, 799, 801, 803, 806, 807, 809, 814, 815, 816, 817, 820, 822, 826, 828, 829, 830, 832, 834, 839, 840, 841, 842, 843, 846, 847, 848, 849, 850, 851, 853, 855, 856, 857, 859, 863, 864, 865, 868, 870, 871, 872, 873, 876, 877, 879, 881, 882, 884, 889, 891, 892, 894, 897, 899, 900, 901, 902, 904, 905, 908, 909, 912, 914, 918, 921, 922, 924, 925, 926, 927, 928, 929, 931, 934, 936, 938, 941, 942, 945, 946, 948, 950, 951, 952, 953, 954, 955, 956, 958, 959, 962, 963, 971, 974, 979, 983, 986, 987, 992, 993, 994, 995, 998, 1000, 1006, 1007, 1009, 1014, 1015, 1019, 1020, 1021, 1022, 1024, 1026, 1027, 1030, 1034, 1036, 1037, 1039, 1040, 1044, 1049, 1055, 1056, 1057, 1059, 1065, 1069, 1083, 1085, 1087, 1088, 1091, 1093, 1094, 1097, 1098, 1103, 1104, 1105, 1109, 1114, 1116, 1118, 1120, 1121, 1127, 1128, 1129, 1130, 1133, 1137, 1138, 1142, 1143, 1145, 1146, 1158, 1159, 1160, 1163, 1164, 1165, 1167, 1168, 1170, 1171, 1172, 1173, 1174, 1177, 1178, 1183, 1184, 1186, 1188, 1191, 1192, 1194, 1195, 1196, 1197, 1199, 1203, 1205, 1206, 1208, 1209, 1210, 1211, 1212, 1214, 1217, 1219, 1220, 1221, 1222, 1225, 1226, 1227, 1228, 1229, 1230, 1233, 1234, 1235, 1239, 1244, 1246, 1248, 1249, 1258, 1259, 1260, 1262, 1264, 1265, 1266, 1269, 1273, 1274, 1275, 1276, 1278, 1279, 1283, 1285, 1287, 1291, 1293, 1294, 1299, 1304, 1305, 1309, 1312, 1315, 1316, 1318, 1323, 1326, 1331, 1333, 1337, 1338, 1339, 1342, 1343, 1347, 1350, 1352, 1356, 1360, 1362, 1365, 1366, 1367, 1369, 1372, 1373, 1379, 1381, 1383, 1384, 1389, 1392, 1393, 1395, 1396, 1397, 1398, 1399, 1403, 1405, 1409, 1413, 1416, 1417, 1421, 1422, 1426, 1431, 1435, 1440, 1442, 1445, 1447, 1450, 1453, 1454, 1455, 1459, 1462, 1465, 1467, 1470, 1471, 1473, 1475, 1477, 1478, 1479, 1483, 
1484, 1485, 1486, 1488, 1489, 1491, 1494, 1495, 1497, 1498, 1502, 1505, 1508, 1514, 1517, 1519, 1522, 1524, 1525, 1528, 1529, 1531, 1532, 1534, 1537, 1538, 1544, 1545, 1546, 1548, 1549, 1550, 1552, 1557, 1561, 1563, 1564, 1565, 1567, 1571, 1572, 1578, 1579, 1583
array([[34.91558091,  2.09543568,  1.86929461,  7.1659751 ],
       [29.02968531, 23.47202797,  3.75524476,  6.93356643],
       [25.48561275,  1.03186275,  0.88480392,  7.48039216]])
0       0
1       0
2       0
3       1
4       0
1579    1
1580    1
1581    1
1582    1
1583    1
Name: HeartDisease, Length: 1584, dtype: int64
In [69]:
from sklearn.decomposition import PCA

# Project X onto its first two principal components so the samples can be
# drawn in a single 2-D scatter plot.
pca_2d = PCA(n_components=2)
reduced_data = pca_2d.fit_transform(X)

[[-5.84622064 -8.51798399]
 [-5.7037399  -4.66415042]
 [21.96082207 -6.57635165]
 [24.02481154 -4.69225678]
 [24.02481154 -4.69225678]
 [-5.69844894 -3.81424256]]
In [70]:
import matplotlib.pyplot as plt
import pandas as pd

# The cell previously held dangling, indented argument lines with no enclosing
# call, which is a syntax error. The draw_data_2d(...) calls are restored here.
plt.figure(figsize=(16, 6))

# Left panel: every point gets the same label (0), i.e. the PCA projection is
# shown without any grouping. The projection is wrapped in a DataFrame because
# draw_data_2d selects columns with .iloc.
draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    labels=[0] * len(reduced_data),
    subplot=plt.subplot(1, 2, 1),
)
plt.title('PCA Data (Without True Labels)')

# Right panel: the same projection coloured by the HeartDisease column.
# NOTE(review): assumes reduced_data rows are in the same order as df
# (reduced_data is computed from X, which is a column subset of df) — confirm.
draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    df['HeartDisease'],
    plt.subplot(1, 2, 2),
)
plt.title('PCA Data (With True Labels)')

# Explicit show() renders the figure and keeps the Text repr of the last
# title call out of the cell output.
plt.show()

In [71]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def fit_kmeans(data, n_clusters, random_state):
    """Create a KMeans estimator, fit it on ``data`` and return it.

    Args:
        data: Samples to cluster.
        n_clusters: Number of clusters to form.
        random_state: Seed forwarded to ``KMeans``.

    Returns:
        The fitted ``KMeans`` instance.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    # The estimator must be fitted before it is returned: callers read
    # labels_ and cluster_centers_, which only exist after fit().
    kmeans.fit(data)
    return kmeans

def draw_clusters(data, kmeans):
    """Plot 2-D points coloured by cluster label, with centroids overlaid.

    Args:
        data: 2-D array indexed as ``data[:, 0]`` / ``data[:, 1]``.
        kmeans: Object exposing ``labels_`` and ``cluster_centers_``.
    """
    assignments = kmeans.labels_
    centroids = kmeans.cluster_centers_

    plt.figure(figsize=(10, 6))

    point_x, point_y = data[:, 0], data[:, 1]
    plt.scatter(point_x, point_y, c=assignments, cmap='viridis', alpha=0.6)

    # Red crosses mark the cluster centres.
    centroid_style = dict(marker='x', s=200, linewidths=3, color='r')
    plt.scatter(centroids[:, 0], centroids[:, 1], **centroid_style)

    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('K-means Clustering (2 Clusters) with PCA')

# Two clusters on the 2-D PCA projection. random_state is not assigned in this
# cell; it reuses whatever value an earlier cell left in the kernel.
kmeans = fit_kmeans(data=reduced_data, n_clusters=2, random_state=random_state)
draw_clusters(data=reduced_data, kmeans=kmeans)
In [72]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def fit_kmeans(data, n_clusters, random_state):
    """Create a KMeans estimator, fit it on ``data`` and return it.

    Args:
        data: Samples to cluster.
        n_clusters: Number of clusters to form.
        random_state: Seed forwarded to ``KMeans``.

    Returns:
        The fitted ``KMeans`` instance.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    # The estimator must be fitted before it is returned: callers read
    # labels_ and cluster_centers_, which only exist after fit().
    kmeans.fit(data)
    return kmeans

def draw_clusters(data, kmeans):
    """Plot 2-D points coloured by cluster label, with centroids overlaid.

    Args:
        data: 2-D array indexed as ``data[:, 0]`` / ``data[:, 1]``.
        kmeans: Object exposing ``labels_`` and ``cluster_centers_``.
    """
    assignments = kmeans.labels_
    centroids = kmeans.cluster_centers_

    plt.figure(figsize=(10, 6))

    point_x, point_y = data[:, 0], data[:, 1]
    plt.scatter(point_x, point_y, c=assignments, cmap='viridis', alpha=0.6)

    # White crosses mark the cluster centres.
    centroid_style = dict(marker='x', s=200, linewidths=3, color='white')
    plt.scatter(centroids[:, 0], centroids[:, 1], **centroid_style)

    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('K-means Clustering (PCA-reduced data)')

# Three clusters on the 2-D PCA projection. random_state is not assigned in
# this cell; it reuses whatever value an earlier cell left in the kernel.
kmeans = fit_kmeans(data=reduced_data, n_clusters=3, random_state=random_state)
draw_clusters(data=reduced_data, kmeans=kmeans)
In [73]:
import matplotlib.pyplot as plt
import pandas as pd

# The cell previously held dangling, indented argument lines with no enclosing
# call, which is a syntax error. The draw_data_2d(...) calls are restored here.

# Swap cluster ids 1 and 2 (id 0 is left unchanged). This only changes which
# colour each cluster receives in the left panel.
labels = [2 if val == 1 else 1 if val == 2 else val for val in kmeans.labels_]

plt.figure(figsize=(16, 6))

# Left panel: PCA projection coloured by the relabelled K-means clusters.
# The projection is wrapped in a DataFrame because draw_data_2d selects
# columns with .iloc.
draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    labels,
    plt.subplot(1, 2, 1),
)
plt.title('Clusters (PCA-reduced data)')

# Right panel: the same projection coloured by the HeartDisease column.
# NOTE(review): assumes reduced_data rows are in the same order as df
# (reduced_data is computed from X, which is a column subset of df) — confirm.
draw_data_2d(
    pd.DataFrame({"Column1": reduced_data[:, 0], "Column2": reduced_data[:, 1]}),
    0,
    1,
    df['HeartDisease'],
    plt.subplot(1, 2, 2),
)
plt.title('True Labels (PCA-reduced data)')

# Explicit show() renders the figure and keeps the Text repr of the last
# title call out of the cell output.
plt.show()

In [74]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def get_clusters_inertia(data, random_state, max_clusters=10):
    """Compute the K-means inertia for every cluster count 1..max_clusters.

    Args:
        data: Samples to cluster.
        random_state: Seed forwarded to each ``KMeans`` instance.
        max_clusters: Largest number of clusters to try (inclusive).

    Returns:
        A ``(inertias, clusters_range)`` tuple where ``inertias[i]`` is the
        inertia of the model fitted with ``clusters_range[i]`` clusters.
    """
    inertias = []
    clusters_range = range(1, max_clusters + 1)
    for n_clusters in clusters_range:
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        # Fit the model and record its inertia; without these two steps the
        # function returned an empty list for every input.
        kmeans.fit(data)
        inertias.append(kmeans.inertia_)
    return inertias, clusters_range

def draw_elbow_diagram(inertias, clusters_range):
    """Plot inertia against the number of clusters (elbow diagram).

    Args:
        inertias: Sequence of inertia values, one per entry of
            ``clusters_range``.
        clusters_range: Cluster counts used as x coordinates.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(clusters_range, inertias, marker='o')
    plt.xlabel('Number of Clusters')
    # Label the y axis as well, matching draw_silhouettes_diagram, so the
    # figure is readable on its own.
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal Number of Clusters')

# Elbow-method sweep over the PCA projection for cluster counts up to
# max_clusters.
random_state = 42
max_clusters = 10

inertias, clusters_range = get_clusters_inertia(
    data=reduced_data,
    random_state=random_state,
    max_clusters=max_clusters,
)

print("Clusters Range:", list(clusters_range))
print("Inertias:", inertias)

draw_elbow_diagram(inertias=inertias, clusters_range=clusters_range)
Clusters Range: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Inertias: [188339.3786248587, 71215.42904037607, 45411.799539507614, 30940.91324871111, 25825.44168747253, 20648.77565717782, 18422.523745341146, 16603.274611675115, 14473.722814510891, 12572.350229277426]
In [75]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

def get_clusters_silhouette_scores(data, random_state, max_clusters=10):
    """Compute the mean silhouette score for every cluster count 2..max_clusters.

    Args:
        data: Samples to cluster.
        random_state: Seed forwarded to each ``KMeans`` instance.
        max_clusters: Largest number of clusters to try (inclusive).

    Returns:
        A ``(silhouette_scores, clusters_range)`` tuple where
        ``silhouette_scores[i]`` belongs to ``clusters_range[i]`` clusters.
    """
    silhouette_scores = []
    # The sweep starts at 2 rather than 1.
    clusters_range = range(2, max_clusters + 1)
    for n_clusters in clusters_range:
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = kmeans.fit_predict(data)
        score = silhouette_score(data, labels)
        # Record the score; it was previously computed and then discarded, so
        # the function returned an empty list for every input.
        silhouette_scores.append(score)
    return silhouette_scores, clusters_range

def draw_silhouettes_diagram(silhouette_scores, clusters_range):
    """Plot the silhouette score against the number of clusters.

    Args:
        silhouette_scores: Sequence of scores, one per entry of
            ``clusters_range``.
        clusters_range: Cluster counts used as x coordinates.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(clusters_range, silhouette_scores, marker='o')

    # Label the axes of the figure that was just drawn into.
    axes = plt.gca()
    axes.set_xlabel('Number of Clusters')
    axes.set_ylabel('Silhouette Score')
    axes.set_title('Silhouette Analysis for Optimal Number of Clusters')

# Silhouette sweep over the standardised PCA projection.
random_state = 42
max_clusters = 10

# NOTE: this overwrites reduced_data with its standardised version, so every
# later cell that reads reduced_data sees the scaled values.
scaler = StandardScaler()
reduced_data = scaler.fit_transform(reduced_data)

silhouette_scores, clusters_range = get_clusters_silhouette_scores(
    data=reduced_data,
    random_state=random_state,
    max_clusters=max_clusters,
)

print("Clusters Range:", list(clusters_range))
print("Silhouette Scores:", silhouette_scores)

draw_silhouettes_diagram(silhouette_scores=silhouette_scores, clusters_range=clusters_range)
Clusters Range: [2, 3, 4, 5, 6, 7, 8, 9, 10]
Silhouette Scores: [np.float64(0.4061232673047139), np.float64(0.5161974215603345), np.float64(0.41688933894376207), np.float64(0.4317880554700233), np.float64(0.43244492414105384), np.float64(0.4520139981348476), np.float64(0.42287542835353636), np.float64(0.4290740936057577), np.float64(0.45589103429651956)]
In [76]:
import math
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
import numpy as np

def get_clusters_silhouettes(data, random_state):
    """Run K-means for a range of cluster counts and collect silhouette data.

    Cluster counts run from 2 up to ``min(12, int(sqrt(len(data))))``
    inclusive.

    Args:
        data: Samples to cluster.
        random_state: Seed forwarded to each ``KMeans`` instance.

    Returns:
        A list with one tuple per cluster count:
        ``(n_clusters, mean_silhouette, labels, cluster_centers)``.
    """
    upper_k = min(12, int(math.sqrt(len(data))))

    def _evaluate(k):
        # Fit one model and bundle everything the plotting step needs.
        model = KMeans(n_clusters=k, random_state=random_state)
        assigned = model.fit_predict(data)
        return (k, silhouette_score(data, assigned), assigned, model.cluster_centers_)

    return [_evaluate(k) for k in range(2, upper_k + 1)]

def draw_silhouettes(data, silhouettes):
    """Draw one silhouette figure per clustering result.

    Each figure has two panels: the per-sample silhouette coefficients grouped
    by cluster (left) and the clustered points with their centres (right).

    Args:
        data: 2-D array indexed as ``data[:, 0]`` / ``data[:, 1]``; the same
            samples the clusterings were computed on.
        silhouettes: Iterable of ``(n_clusters, silhouette_avg,
            cluster_labels, centers)`` tuples, as returned by
            ``get_clusters_silhouettes``.
    """
    for n_clusters, silhouette_avg, cluster_labels, centers in silhouettes:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        ax1.set_xlim([-0.1, 1])
        # Height = one row per sample plus a 10-row gap around each cluster band.
        ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])
        silhouette_values = silhouette_samples(data, cluster_labels)
        y_lower = 10
        for i in range(n_clusters):
            # Sort each cluster's coefficients so its band is a smooth profile;
            # plotted unsorted, the band is a ragged, unreadable fill.
            cluster_silhouette_values = np.sort(silhouette_values[cluster_labels == i])
            cluster_size = cluster_silhouette_values.shape[0]
            y_upper = y_lower + cluster_size
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values, alpha=0.7)
            # Cluster id, placed at the vertical middle of its band.
            ax1.text(-0.05, y_lower + 0.5 * cluster_size, str(i))
            y_lower = y_upper + 10
        ax1.set_title("Silhouette plot for {} clusters".format(n_clusters))
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        # Dashed red line marks the mean silhouette score of this clustering.
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax2.scatter(data[:, 0], data[:, 1], c=cluster_labels, cmap='viridis', s=30)
        ax2.scatter(centers[:, 0], centers[:, 1], marker='x', c='red', s=200, alpha=1)
        ax2.set_title("Clustered data for {} clusters".format(n_clusters))
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")
        plt.suptitle(("Silhouette analysis for KMeans clustering with {} clusters".format(n_clusters)),
                     fontsize=14, fontweight='bold')

# Per-cluster-count silhouette figures for reduced_data (not assigned in this
# cell; it is whatever an earlier cell left in the kernel).
random_state = 42

silhouettes = get_clusters_silhouettes(data=reduced_data, random_state=random_state)

draw_silhouettes(data=reduced_data, silhouettes=silhouettes)
