44 lines
1.5 KiB
Python
44 lines
1.5 KiB
Python
|
import pandas as pd
|
|||
|
import matplotlib.pyplot as plt
|
|||
|
from sklearn.cluster import KMeans
|
|||
|
from sklearn.metrics import silhouette_score
|
|||
|
|
|||
|
# Load the tutors table; the first CSV column becomes the index.
data = pd.read_csv('tutors.csv', index_col=0)

# Columns used as clustering features.
features = ['Price', 'Score', 'Reviews_number', 'Experience']

# Drop only the rows that lack one of the clustering features; a gap in an
# unrelated column is no reason to exclude a tutor from the analysis.
data = data.dropna(subset=features)

# Feature matrix: one row per remaining tutor, one column per feature.
X = data[features].to_numpy()
|
|||
|
|
|||
|
# K-means clustering into three groups.
# random_state pins the centroid initialisation so that cluster numbering and
# every reported figure are reproducible between runs; n_init is given
# explicitly because its default differs between scikit-learn releases.
# NOTE(review): the features are clustered unscaled; if their ranges differ a
# lot, the widest one dominates the distance - consider standardising first.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X)

# Cluster label assigned to each row of X.
labels = kmeans.labels_
data['Cluster'] = labels

# Cluster centres: one row per cluster, one column per feature.
cluster_centers = kmeans.cluster_centers_

# Number of tutors in each cluster, indexed by cluster label.
cluster_counts = data['Cluster'].value_counts()
|
|||
|
|
|||
|
# Print a summary for every cluster: its size and its centre coordinates.
for cluster in sorted(data['Cluster'].unique()):
    # cluster + 1 turns the zero-based label into one-based numbering.
    print(f"-----Cluster № {cluster + 1} -----")
    print(f"Number of tutors: {cluster_counts[cluster]}")
    print("Cluster centers:")
    # Pair each feature name with the matching coordinate of this centre.
    for f, c in zip(features, cluster_centers[cluster]):
        print(f"{f} - {c:.2f}")
    # Blank line between consecutive cluster summaries.
    print()
|
|||
|
|
|||
|
# Assess clustering quality with the silhouette coefficient.
silhouette = silhouette_score(X, labels=labels)
print(f"Коэффициент силуэта: {silhouette:.2f}")
|
|||
|
|
|||
|
# Plot the tutors in the Score/Price plane, coloured by cluster label.
plt.scatter(data['Score'], data['Price'], c=labels)

# Mark the cluster centres with red crosses. The centre columns are looked up
# by feature name so they stay correct if `features` is ever reordered.
score_col = features.index('Score')
price_col = features.index('Price')
plt.scatter(
    cluster_centers[:, score_col],
    cluster_centers[:, price_col],
    marker='x',
    color='red',
)

plt.xlabel('Score')
plt.ylabel('Price')
plt.title('K-means clustering')
plt.show()
|