39 lines
1.6 KiB
Python
39 lines
1.6 KiB
Python
|
import pandas as pd
|
||
|
import matplotlib.pyplot as plt
|
||
|
from scipy.cluster.hierarchy import dendrogram, linkage
|
||
|
from sklearn.cluster import AgglomerativeClustering
|
||
|
from sklearn.metrics import silhouette_score
|
||
|
|
||
|
# Hierarchical (Ward) clustering of a sample of the smoking/drinking data set:
# draws a dendrogram to "dendr.png" and prints the silhouette score of a
# 4-cluster agglomerative solution.

# Load the data.
data = pd.read_csv("smoking_drinking_dataset.csv")

# Reduce the data size: keep a reproducible 1% random sample of the rows.
data = data.sample(frac=0.01, random_state=42)

# Subset of columns used as clustering features.
features = [
    'age', 'height', 'weight', 'waistline',
    'sight_left', 'sight_right', 'hear_left', 'hear_right',
    'SBP', 'DBP', 'BLDS',
    'tot_chole', 'HDL_chole', 'LDL_chole', 'triglyceride',
    'hemoglobin', 'urine_protein', 'serum_creatinine',
    'SGOT_AST', 'SGOT_ALT', 'gamma_GTP',
]

# Data to cluster.
cluster_data = data[features]

# Normalize each feature to zero mean and unit standard deviation.
# A feature that is constant within the sample has a standard deviation of 0;
# dividing by it would produce NaN and make linkage() reject the matrix, so
# such a deviation is replaced by 1 and the column simply becomes all zeros.
feature_std = cluster_data.std().replace(0, 1)
normalized_data = (cluster_data - cluster_data.mean()) / feature_std

# Compute the linkage matrix with Ward's method.
linkage_matrix = linkage(normalized_data, method='ward')

# Draw the dendrogram.
plt.figure(figsize=(16, 8))
dendrogram(
    linkage_matrix,
    orientation='top',
    # Pass the row labels as a plain list rather than a pandas Index object.
    labels=data.index.tolist(),
    distance_sort='descending',
    show_leaf_counts=True,
)
plt.title('Дендрограма')
plt.ylabel('Дистанция')
# The figure is written to disk instead of being shown interactively.
plt.savefig("dendr.png")

# Evaluate the method: fit a 4-cluster Ward agglomerative model on the same
# normalized features, store the cluster label of every sampled row, and
# report the mean silhouette score.
n_clusters = 4
cluster_model = AgglomerativeClustering(
    n_clusters=n_clusters,
    metric='euclidean',
    linkage='ward',
)
labels = cluster_model.fit_predict(normalized_data)
data['cluster'] = labels
silhouette_avg = silhouette_score(normalized_data, labels)
print(f"Silhouette Score: {silhouette_avg}")