"""Matplotlib helpers for plotting 2-D data and clustering diagnostics."""

from typing import Any, Dict, List

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
from scipy.cluster import hierarchy
from sklearn.cluster import KMeans


def draw_data_2d(
    df: DataFrame,
    col1: int,
    col2: int,
    y: List | None = None,
    classes: List | None = None,
    subplot: Any | None = None,
):
    """Scatter two columns of *df* against each other.

    Args:
        df: Source table; columns are selected by position.
        col1: Positional index of the column plotted on the x axis.
        col2: Positional index of the column plotted on the y axis.
        y: Optional per-point values passed to ``scatter`` as ``c`` (colors).
        classes: Optional legend labels; when given, a "Classes" legend is
            drawn in the lower-right corner.
        subplot: Axes-like object to draw on; a new figure/axes pair is
            created when omitted.
    """
    ax = plt.subplots()[1] if subplot is None else subplot

    x_name = df.columns[col1]
    y_name = df.columns[col2]

    scatter = ax.scatter(df[x_name], df[y_name], c=y)
    ax.set(xlabel=x_name, ylabel=y_name)

    if classes is not None:
        ax.legend(
            scatter.legend_elements()[0],
            classes,
            loc="lower right",
            title="Classes",
        )


def draw_dendrogram(linkage_matrix: np.ndarray):
    """Plot a dendrogram of *linkage_matrix*, truncated to 3 levels."""
    hierarchy.dendrogram(linkage_matrix, truncate_mode="level", p=3)


def draw_cluster_results(
    df: DataFrame,
    col1: int,
    col2: int,
    labels: np.ndarray,
    cluster_centers: np.ndarray,
    subplot: Any | None = None,
):
    """Scatter the rows of *df* grouped by cluster label, plus the centroids.

    Args:
        df: Source table; columns are selected by position.
        col1: Positional index of the x-axis column (also used as the column
            index into *cluster_centers*).
        col2: Positional index of the y-axis column (also used as the column
            index into *cluster_centers*).
        labels: Per-row cluster labels used to build boolean row masks.
        cluster_centers: 2-D array of centroids, indexed as ``[:, col]``.
        subplot: Axes-like object to draw on; the ``pyplot`` module itself is
            used (i.e. the current axes) when omitted.
    """
    ax = plt if subplot is None else subplot

    x_name = df.columns[col1]
    y_name = df.columns[col2]

    for label in np.unique(labels):
        # Filter the frame once per cluster and reuse it for both axes.
        members = df[labels == label]
        ax.scatter(members[x_name], members[y_name], label=label)

    ax.scatter(cluster_centers[:, col1], cluster_centers[:, col2], s=80, color="k")


def draw_clusters(reduced_data: np.ndarray, kmeans: KMeans):
    """Draw the K-means decision regions over two-column data.

    The first two columns of *reduced_data* define a mesh (step 0.02, padded
    by 1 on every side); ``kmeans.predict`` is evaluated on that mesh and
    shown as an image on figure 1, with the data points and the centroids
    drawn on top.

    Args:
        reduced_data: 2-D array; columns 0 and 1 are used as x and y.
        kmeans: Model providing ``predict`` and ``cluster_centers_``.
    """
    step = 0.02
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, step),
        np.arange(y_min, y_max, step),
    )

    # Predict a label for every mesh node, then fold back to the mesh shape.
    mesh_labels = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    mesh_labels = mesh_labels.reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(
        mesh_labels,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,  # type: ignore
        aspect="auto",
        origin="lower",
    )

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

    centroids = kmeans.cluster_centers_
    plt.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )
    plt.title(
        "K-means clustering (PCA-reduced data)\n"
        "Centroids are marked with white cross"
    )
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())


def _draw_cluster_scores(
    data: List,
    clusters_range: range,
    score_name: str,
    title: str,
):
    """Plot *data* against *clusters_range* as a blue line with markers.

    Args:
        data: Score values, one per entry of *clusters_range*.
        clusters_range: Values placed on the x axis (labelled ``$k$``).
        score_name: Label of the y axis.
        title: Figure title.
    """
    plt.figure(figsize=(8, 5))
    plt.plot(clusters_range, data, "bo-")
    plt.xlabel("$k$", fontsize=8)
    plt.ylabel(score_name, fontsize=8)
    plt.title(title)


def draw_elbow_diagram(inertias: List, clusters_range: range):
    """Plot inertia values against the number of clusters."""
    _draw_cluster_scores(inertias, clusters_range, "Inertia", "The Elbow Diagram")


def draw_silhouettes_diagram(silhouette: List, clusters_range: range):
    """Plot silhouette scores against the number of clusters."""
    _draw_cluster_scores(
        silhouette, clusters_range, "Silhouette score", "The Silhouette score"
    )


def _draw_silhouette(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    silhouette_avg: float,
    sample_silhouette_values: np.ndarray | List,
    cluster_labels: np.ndarray | List,
):
    """Draw one horizontal silhouette band per cluster on *ax*.

    Args:
        ax: Axes-like object to draw on.
        reduced_data: Only its length is used, to size the y axis.
        n_clusters: Number of clusters; labels ``0 .. n_clusters - 1`` are
            drawn.
        silhouette_avg: Position of the red dashed vertical line.
        sample_silhouette_values: Per-sample silhouette values.
        cluster_labels: Per-sample cluster labels, aligned with
            *sample_silhouette_values*.
    """
    # Boolean-mask selection below needs ndarray semantics: with plain lists
    # ``labels == i`` is a single bool, not an element-wise mask.
    values = np.asarray(sample_silhouette_values)
    labels = np.asarray(cluster_labels)

    gap = 10  # blank rows separating consecutive cluster bands

    ax.set_xlim([-0.1, 1])
    ax.set_ylim([0, len(reduced_data) + (n_clusters + 1) * gap])

    y_lower = gap
    for i in range(n_clusters):
        # Masked selection yields a copy, so sorting never touches the input.
        cluster_values = np.sort(values[labels == i])
        cluster_size = cluster_values.shape[0]
        y_upper = y_lower + cluster_size

        color = cm.nipy_spectral(float(i) / n_clusters)  # type: ignore
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            cluster_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        # Put the cluster number at the vertical middle of its band.
        ax.text(-0.05, y_lower + 0.5 * cluster_size, str(i))

        y_lower = y_upper + gap

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])


def _draw_cluster_data(
    ax: Any,
    reduced_data: np.ndarray,
    n_clusters: int,
    cluster_labels: np.ndarray,
    cluster_centers: np.ndarray,
):
    """Scatter the samples colored by cluster and mark numbered centroids.

    Args:
        ax: Axes-like object to draw on.
        reduced_data: 2-D array; columns 0 and 1 are used as x and y.
        n_clusters: Divisor that maps labels onto the ``nipy_spectral``
            colormap.
        cluster_labels: Per-sample cluster labels.
        cluster_centers: 2-D array of centroids; columns 0 and 1 are used.
    """
    # Coerce so that list-like labels work the same way as ndarrays.
    labels = np.asarray(cluster_labels)
    colors = cm.nipy_spectral(labels.astype(float) / n_clusters)  # type: ignore
    ax.scatter(
        reduced_data[:, 0],
        reduced_data[:, 1],
        marker=".",
        s=30,
        lw=0,
        alpha=0.7,
        c=colors,
        edgecolor="k",
    )

    # White discs first, then the centroid index drawn on top of each disc.
    ax.scatter(
        cluster_centers[:, 0],
        cluster_centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )
    for i, center in enumerate(cluster_centers):
        ax.scatter(
            center[0],
            center[1],
            marker="$%d$" % i,
            alpha=1,
            s=50,
            edgecolor="k",
        )

    ax.set_title("The visualization of the clustered data.")
    ax.set_xlabel("Feature space for the 1st feature")
    ax.set_ylabel("Feature space for the 2nd feature")


def draw_silhouettes(reduced_data: np.ndarray, silhouettes: Dict):
    """Create one two-panel silhouette figure per entry of *silhouettes*.

    Args:
        reduced_data: 2-D array forwarded to both panel helpers.
        silhouettes: Mapping ``n_clusters -> value`` where ``value[0]`` is the
            average silhouette score, ``value[1]`` the per-sample silhouette
            values and ``value[2]`` an object exposing ``labels_`` and
            ``cluster_centers_``.
    """
    for n_clusters, value in silhouettes.items():
        silhouette_avg, sample_silhouette_values, model = value[0], value[1], value[2]
        cluster_labels = model.labels_
        cluster_centers = model.cluster_centers_

        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        _draw_silhouette(
            ax1,
            reduced_data,
            n_clusters,
            silhouette_avg,
            sample_silhouette_values,
            cluster_labels,
        )
        _draw_cluster_data(
            ax2,
            reduced_data,
            n_clusters,
            cluster_labels,
            cluster_centers,
        )

        plt.suptitle(
            "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
            % n_clusters,
            fontsize=14,
            fontweight="bold",
        )